From be87154a2f83f25c269eb3ce2bcca0b82356a8c5 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 16 Feb 2022 13:47:41 -0700 Subject: [PATCH 001/261] improve Kokkos::Experimental::Controls::getParameter ergonomics and add unit tests --- src/common/KokkosKernels_Controls.hpp | 25 ++++---- unit_test/common/Test_Common.hpp | 1 + unit_test/common/Test_Common_Controls.hpp | 72 +++++++++++++++++++++++ 3 files changed, 83 insertions(+), 15 deletions(-) create mode 100644 unit_test/common/Test_Common_Controls.hpp diff --git a/src/common/KokkosKernels_Controls.hpp b/src/common/KokkosKernels_Controls.hpp index c5a47a24b3..a1a4fb59ea 100644 --- a/src/common/KokkosKernels_Controls.hpp +++ b/src/common/KokkosKernels_Controls.hpp @@ -81,28 +81,23 @@ class Controls { // check if a parameter is already set bool isParameter(const std::string& name) const { - bool return_value = false; - - auto search = kernel_parameters.find(name); - if (search != kernel_parameters.end()) { - return_value = true; - } - - return return_value; + return kernel_parameters.end() != kernel_parameters.find(name); } - // retrieve the value associated with a parameter if it is already set - std::string getParameter(const std::string& name) const { + /// \brief get the value associated with \c name, or \c default if not present + /// + /// \param name the name of the parameter to retrieve + /// \param orUnset (default \c "" ) the value to return if \c name is not set + std::string getParameter(const std::string& name, + const std::string& orUnset = "") const { auto search = kernel_parameters.find(name); - std::string value; - if (search == kernel_parameters.end()) { + if (kernel_parameters.end() == search) { std::cout << "Parameter " << name << " was not found in the list of parameters!" << std::endl; - value = ""; + return orUnset; } else { - value = search->second; + return search->second; } - return value; } #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS diff --git a/unit_test/common/Test_Common.hpp b/unit_test/common/Test_Common.hpp index 0a194071a8..9d6958e816 100644 --- a/unit_test/common/Test_Common.hpp +++ b/unit_test/common/Test_Common.hpp @@ -11,5 +11,6 @@ #include #include #include +#include #endif // TEST_COMMON_HPP diff --git a/unit_test/common/Test_Common_Controls.hpp b/unit_test/common/Test_Common_Controls.hpp new file mode 100644 index 0000000000..48c2a96715 --- /dev/null +++ b/unit_test/common/Test_Common_Controls.hpp @@ -0,0 +1,72 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TEST_COMMON_CONTROLS_HPP +#define TEST_COMMON_CONTROLS_HPP + +#include "KokkosKernels_Controls.hpp" + +void test_controls_empty() { + KokkosKernels::Experimental::Controls c; + EXPECT_EQ(c.isParameter(""), false); + EXPECT_EQ(c.getParameter(""), ""); + EXPECT_EQ(c.getParameter("", "default"), "default"); +} + +void test_controls_set() { + KokkosKernels::Experimental::Controls c; + c.setParameter("key", "value"); + EXPECT_EQ(c.isParameter("key"), true); + EXPECT_EQ(c.getParameter("key"), "value"); + EXPECT_EQ(c.getParameter("key", "default"), "value"); + + EXPECT_EQ(c.isParameter(""), false); + EXPECT_EQ(c.getParameter(""), ""); + EXPECT_EQ(c.getParameter("", "default"), "default"); +} + +TEST_F(TestCategory, controls_empty) { test_controls_empty(); } +TEST_F(TestCategory, controls_set) { test_controls_set(); } + +#endif // TEST_COMMON_CONTROLS_HPP From 78f23d00e1e3e29db3ffce338b87aa155e5ca43b Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 16 Feb 2022 16:12:38 -0700 Subject: [PATCH 002/261] prevent tensor-core instantiation on non-GPU exec spaces. Disallow tensor cores except for non-transpose --- .../impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 60 +++--- .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 172 +++++++++++------- 2 files changed, 129 insertions(+), 103 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index b87a9fa460..8f7eeb821e 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -46,6 +46,7 @@ #define KOKKOSSPARSE_IMPL_SPMV_BSRMATRIX_IMPL_HPP_ #include "KokkosKernels_Error.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #if defined(KOKKOS_ENABLE_CUDA) && \ (defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_AMPERE)) @@ -320,10 +321,8 @@ struct BsrMatrixSpMVTensorCoreFunctor { // no barrier - each warp uses independent shared memory // load from the shared memory -#ifdef __CUDA_ARCH__ load_matrix_sync(fy, &sy(warpIdx_y, warpIdx_x, 0, 0), FRAG_N, nvcuda::wmma::mem_row_major); -#endif auto rowView = a.block_row_Const(blockIdx_i); @@ -363,17 +362,12 @@ struct BsrMatrixSpMVTensorCoreFunctor { const AOrdinal bj = bk + tj; // fill shmem with 0 outside of the block boundary -#ifdef __CUDA_ARCH__ if (bi < a.blockDim() && bj < a.blockDim()) { sa(ti / FRAG_M, ti % FRAG_M, tj) = AFragScalar(alpha * ap[bi * a.blockDim() + bj]); } else { sa(ti / FRAG_M, ti % FRAG_M, tj) = AFragScalar(0); } -#else - (void)bi; - (void)bj; -#endif } // collaborative load of X fragments into shared memory @@ -391,7 +385,6 @@ struct BsrMatrixSpMVTensorCoreFunctor { // load 0 outside of the block boundary // x is not necessarily a multiple of block size, so make sure access // is in bounds -#ifdef __CUDA_ARCH__ if (bi < a.blockDim() && bj < a.blockDim() && unsigned(blockIdx_j * a.blockDim() + bj) < x.extent(1)) { // tile is some fragments in the j/n direction that are frag_n wide @@ -400,15 +393,10 @@ struct BsrMatrixSpMVTensorCoreFunctor { } else { sx(tj / FRAG_N, ti, tj % FRAG_N) = XFragScalar(0); } -#else - (void)bi; - (void)bj; -#endif } mbr.team_barrier(); // load correct fragment from shared memory and accumulate -#ifdef __CUDA_ARCH__ // only need to do any math if our fragment will write a result back to // Y if (ay_i < static_cast(y.extent(0)) && @@ -417,17 +405,12 @@ struct BsrMatrixSpMVTensorCoreFunctor { load_matrix_sync(fx, &sx(warpIdx_x, 0, 0), FRAG_N); mma_sync(fy, fa, fx, fy); } -#endif } - (void)j; - (void)ap; } // loop through blocks in row of A -#ifdef __CUDA_ARCH__ // store Y fragments into shared memory store_matrix_sync(&sy(warpIdx_y, warpIdx_x, 0, 0), fy, FRAG_N, nvcuda::wmma::mem_row_major); -#endif // team loads its fragments of Y that make up part or all of the block of Y // it's responsible for. each warp loads the part corresponding to its y // fragment @@ -447,21 +430,16 @@ struct BsrMatrixSpMVTensorCoreFunctor { } } mbr.team_barrier(); - - // Suppress unused var warnings - // TODO (@cwpearson): Should this functor only compile on device? - (void)fx; - (void)fa; - (void)fy; } }; -/* Instantiate some common template parameter values - for BsrMatrixSpMVTensorCoreFunctor. - This is a struct instead of a function for template...using shorthand - Discriminates between complex (supported) and non-complex (unsupported) - scalar types, and throws a runtime error for unsupported types -*/ +/// \brief Avoid instantiating tensor core functor for unsupported types +/// +/// Instantiate some common template parameter values +/// for BsrMatrixSpMVTensorCoreFunctor. +/// This is a struct instead of a function for template...using shorthand +/// Discriminates between non-complex/on-GPU (supported) and otherwise +/// (unsupported) scalar types, and throws a runtime error for unsupported types template struct none_complex { const static bool value = !Kokkos::ArithTraits::is_complex && @@ -528,11 +505,22 @@ struct BsrMatrixSpMVTensorCoreDispatcher { !Kokkos::ArithTraits::is_complex; }; + /*true if T1::execution_space, T2, or T3 are all GPU exec space*/ + template + struct all_gpu { + const static bool value = KokkosKernels::Impl::kk_is_gpu_exec_space() && + KokkosKernels::Impl::kk_is_gpu_exec_space() && + KokkosKernels::Impl::kk_is_gpu_exec_space(); + }; + static void dispatch(YScalar alpha, AMatrix a, XMatrix x, YScalar beta, YMatrix y) { - using tag = - std::integral_constant::value>; + // tag will be false unless all conditions are met + using tag = std::integral_constant< + bool, none_complex::value && + all_gpu::value>; tag_dispatch(tag{}, alpha, a, x, beta, y); } }; diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 4d6d6cd1b5..089c9d4c71 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -201,88 +201,126 @@ struct SPMV_MV_BSRMATRIX YVector; typedef typename YVector::non_const_value_type YScalar; + enum class Method { + Fallback, ///< Don't use tensor cores + TensorCores ///< use tensor cores + }; + + /// Precision to use in the tensor core implementation + enum class Precision { + Automatic, ///< Use Double, unless operations match mixed precision + Double, ///< fp64 += fp64 * fp64 + Mixed ///< fp32 += fp16 * fp16 + }; + static void spmv_mv_bsrmatrix( const KokkosKernels::Experimental::Controls &controls, const char mode[], const YScalar &alpha, const AMatrix &A, const XVector &X, const YScalar &beta, const YVector &Y) { + Method method = Method::Fallback; + #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) - // user explicitly requests a particular precision - bool requestMixed = false; - bool requestDouble = false; - if (controls.isParameter("tc_precision")) { - if (controls.getParameter("tc_precision") == "mixed") { - requestMixed = true; - } else if (controls.getParameter("tc_precision") == "double") { - requestDouble = true; - } - } - // - bool use_tc = false; - if ((controls.isParameter("algorithm")) && - (controls.getParameter("algorithm") == "experimental_bsr_tc")) { - if (Kokkos::Details::ArithTraits::is_complex == false) - use_tc = true; + { + typedef typename AMatrix::non_const_value_type AScalar; + typedef typename XVector::non_const_value_type XScalar; + // try to use tensor cores if requested + if (controls.getParameter("algorithm") == "experimental_bsr_tc") + method = Method::TensorCores; + // can't use tensor cores for complex + if (Kokkos::Details::ArithTraits::is_complex) + method = Method::Fallback; + if (Kokkos::Details::ArithTraits::is_complex) + method = Method::Fallback; + if (Kokkos::Details::ArithTraits::is_complex) + method = Method::Fallback; + // can't use tensor cores outside GPU + if (!KokkosKernels::Impl::kk_is_gpu_exec_space< + typename AMatrix::execution_space>()) + method = Method::Fallback; + if (!KokkosKernels::Impl::kk_is_gpu_exec_space< + typename XVector::execution_space>()) + method = Method::Fallback; + if (!KokkosKernels::Impl::kk_is_gpu_exec_space< + typename YVector::execution_space>()) + method = Method::Fallback; + // can't use tensor cores unless mode is no-transpose + if (mode[0] != KokkosSparse::NoTranspose[0]) method = Method::Fallback; +#if KOKKOS_HALF_T_IS_FLOAT + // disable tensor cores when Kokkos half is actually a float + method = Method::Fallback; +#endif } #endif #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ARCH_AMPERE) - typedef typename XVector::non_const_value_type XScalar; - typedef typename AMatrix::non_const_value_type AScalar; - typedef Kokkos::Experimental::half_t Half; - - /* Ampere has double += double * double and float += half * half - - use whichever is requested. - If none requested, used mixed precision if the inputs are mixed, otherwise - use double - */ - - // input precision matches a tensor core fragment type - constexpr bool operandsHalfHalfFloat = std::is_same::value && - std::is_same::value && - std::is_same::value; - - if (use_tc) { - if (requestMixed) { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, - X, beta, - Y); - return; - } else if (requestDouble) { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, Y); - return; - } else if (operandsHalfHalfFloat) { + { + typedef Kokkos::Experimental::half_t Half; + typedef typename AMatrix::non_const_value_type AScalar; + typedef typename XVector::non_const_value_type XScalar; + + /* Ampere has double += double * double and float += half * half + + use whichever is requested. + If none requested, used mixed precision if the inputs are mixed, otherwise + use double + */ + if (Method::TensorCores == method) { + Precision precision = Precision::Automatic; + if (controls.getParameter("tc_precision") == "mixed") + precision = Precision::Mixed; + else if (controls.getParameter("tc_precision") == "double") + precision = Precision::Double; + + switch (precision) { + case Precision::Mixed: { + BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, + Y); + return; + } + case Precision::Double: { + BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, + Y); + return; + } + case Precision::Automatic: // fallthrough + default: { + constexpr bool operandsHalfHalfFloat = + std::is_same::value && + std::is_same::value && + std::is_same::value; + if (operandsHalfHalfFloat) { + BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, + Y); + return; + } else { + BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, + Y); + return; + } + } + } + } + } +#elif defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ARCH_VOLTA) + { + /* Volta has float += half * half + use it for all matrices + */ + if (Method::TensorCores == method) { BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, Y); return; - } else { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, Y); - return; - } - } -#elif defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ARCH_VOLTA) - /* Volta has float += half * half - use it for all matrices - */ - if (use_tc) { - if (requestDouble) { - KokkosKernels::Impl::throw_runtime_exception( - "KokkosSparse::spmv[algorithm=experimental_bsr_tc] " - "tc_precision=double unsupported KOKKOS_ARCH_VOLTA"); } - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, - X, beta, - Y); - (void)requestMixed; // unused - return; } #endif // KOKKOS_ARCH From 3f1b7babd354f89dc27842d894546cd38f042e63 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 16 Feb 2022 16:49:00 -0700 Subject: [PATCH 003/261] improve error message for unsupported tensor core invocation --- src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 8f7eeb821e..69a95f6f9e 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -494,7 +494,9 @@ struct BsrMatrixSpMVTensorCoreDispatcher { // to be used to avoid instantiating on unsupported types static void tag_dispatch(std::false_type, YScalar, AMatrix, XMatrix, YScalar, YMatrix) { - KokkosKernels::Impl::throw_runtime_exception("unsupported for arguments"); + KokkosKernels::Impl::throw_runtime_exception( + "Tensor core SpMV is only supported for non-complex types in GPU " + "execution spaces"); } /*true if none of T1, T2, or T3 are complex*/ From 30157b103e713ccdd3028eba9b425d3106fc8c20 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Wed, 16 Feb 2022 16:49:23 -0700 Subject: [PATCH 004/261] fix unused variable when CUDA not enabled --- src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 089c9d4c71..52bbb2f839 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -217,9 +217,8 @@ struct SPMV_MV_BSRMATRIX Date: Tue, 22 Feb 2022 08:16:55 -0500 Subject: [PATCH 005/261] Fixup prefer std::fabs on the host-side and drop pointless cast --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index e3d991c7c1..aa78e0bf97 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1574,8 +1574,8 @@ static inline bool __gemm_print_compare_failure(ViewType h_expected, ViewType h_actual, int i, int j, int k, double epsilon) { STATUS; - auto diff = static_cast(Kokkos::Experimental::fabs( - static_cast(h_expected(i, j, k) - h_actual(i, j, k)))); + auto diff = + std::fabs(static_cast(h_expected(i, j, k) - h_actual(i, j, k))); if (diff > epsilon) { printf( From aa6b100a428519039e2842133f8ea05e8bcea92f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 22 Feb 2022 08:17:51 -0500 Subject: [PATCH 006/261] Fixup conditionally use sqrt from Kokkos:: or Kokkos::Experimental:: depending on KOKKOS_VERSION --- .../impl/KokkosBatched_SVD_Serial_Internal.hpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp index 0c7007bdf3..01e69307d4 100644 --- a/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp @@ -37,10 +37,15 @@ struct SerialSVDInternal { KOKKOS_INLINE_FUNCTION static void symEigen2x2(value_type a11, value_type a21, value_type a22, value_type& e1, value_type& e2) { - value_type a = Kokkos::ArithTraits::one(); - value_type b = -a11 - a22; - value_type c = a11 * a22 - a21 * a21; - value_type sqrtDet = Kokkos::Experimental::sqrt(b * b - 4 * a * c); + value_type a = Kokkos::ArithTraits::one(); + value_type b = -a11 - a22; + value_type c = a11 * a22 - a21 * a21; +#if KOKKOS_VERSION >= 30600 + using Kokkos::sqrt; +#else + using Kokkos::Experimental::sqrt; +#endif + value_type sqrtDet = sqrt(b * b - 4 * a * c); e1 = (-b + sqrtDet) / (2 * a); e2 = (-b - sqrtDet) / (2 * a); } From 2989f2df02fc69188b922c24969aa1cf68fc6576 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 22 Feb 2022 08:21:25 -0500 Subject: [PATCH 007/261] Adjust Kokkos version for math functions in Kokkos:: namespace --- src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp index 01e69307d4..446ba50c03 100644 --- a/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp @@ -40,7 +40,7 @@ struct SerialSVDInternal { value_type a = Kokkos::ArithTraits::one(); value_type b = -a11 - a22; value_type c = a11 * a22 - a21 * a21; -#if KOKKOS_VERSION >= 30600 +#if KOKKOS_VERSION >= 30699 using Kokkos::sqrt; #else using Kokkos::Experimental::sqrt; From 5abbd09b2d432cb2ea3fe0da34af6f9accbd3860 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 22 Feb 2022 10:17:45 -0500 Subject: [PATCH 008/261] Cleanup prefer std::sqrt on the host-side --- unit_test/batched/dense/Test_Batched_SerialSVD.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unit_test/batched/dense/Test_Batched_SerialSVD.hpp b/unit_test/batched/dense/Test_Batched_SerialSVD.hpp index 57ec7f645b..d30da1726c 100644 --- a/unit_test/batched/dense/Test_Batched_SerialSVD.hpp +++ b/unit_test/batched/dense/Test_Batched_SerialSVD.hpp @@ -31,7 +31,7 @@ double simpleNorm2(const Vector& v) { double m = KAT::abs(vhost(i)); d += m * m; } - return Kokkos::Experimental::sqrt(d); + return std::sqrt(d); } template From 189525e081f3c0f73f256ebd41b6bbb3fb73a650 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 21 Feb 2022 10:45:03 -0700 Subject: [PATCH 009/261] Reduce lots of macro duplication in sparse unit tests Use a new include file, Test_Common_Test_All_Type_Combos.hpp, to test all SCALAR, ORDINAL, and OFFSET type combinations for EXECUTE_TEST. --- .../Test_Common_Test_All_Type_Combos.hpp | 188 +++++++ .../sparse/Test_Sparse_BlockCrsMatrix.hpp | 134 +---- unit_test/sparse/Test_Sparse_BsrMatrix.hpp | 134 +---- unit_test/sparse/Test_Sparse_CrsMatrix.hpp | 128 +---- .../sparse/Test_Sparse_block_gauss_seidel.hpp | 128 +---- unit_test/sparse/Test_Sparse_gauss_seidel.hpp | 128 +---- .../sparse/Test_Sparse_replaceSumInto.hpp | 128 +---- .../Test_Sparse_replaceSumIntoLonger.hpp | 132 +---- unit_test/sparse/Test_Sparse_spadd.hpp | 128 +---- unit_test/sparse/Test_Sparse_spgemm.hpp | 128 +---- .../sparse/Test_Sparse_spgemm_jacobi.hpp | 128 +---- unit_test/sparse/Test_Sparse_spiluk.hpp | 135 +---- unit_test/sparse/Test_Sparse_spmv.hpp | 477 +----------------- .../sparse/Test_Sparse_spmv_blockcrs.hpp | 273 +--------- unit_test/sparse/Test_Sparse_spmv_bsr.hpp | 271 +--------- unit_test/sparse/Test_Sparse_sptrsv.hpp | 128 +---- unit_test/sparse/Test_Sparse_trsv.hpp | 316 +----------- 17 files changed, 279 insertions(+), 2805 deletions(-) create mode 100644 unit_test/common/Test_Common_Test_All_Type_Combos.hpp diff --git a/unit_test/common/Test_Common_Test_All_Type_Combos.hpp b/unit_test/common/Test_Common_Test_All_Type_Combos.hpp new file mode 100644 index 0000000000..60e0651e69 --- /dev/null +++ b/unit_test/common/Test_Common_Test_All_Type_Combos.hpp @@ -0,0 +1,188 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Test_Common_Test_All_Type_Combos.hpp + +/** + * EXECUTE_TEST should take (SCALAR, ORDINAL, OFFSET, DEVICE). All + * these args are types. + * #define NO_TEST_COMPLEX to skip testing of kokkos complex types + */ + +#if !defined(EXECUTE_TEST) +#error Test_Common_Test_All_Type_Combos.hpp requires EXECUTE_TEST to be set +#endif + +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + +// ETI is off, test all possible type combos + +EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(double, int, size_t, TestExecSpace) +EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(float, int, int, TestExecSpace) +EXECUTE_TEST(float, int64_t, int, TestExecSpace) +EXECUTE_TEST(float, int, size_t, TestExecSpace) +EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) + +# if !defined(NO_TEST_COMPLEX) + +EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) +EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) +EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) +EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) +EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) +EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) +EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) + +# endif + +#else + +// ETI is on, only test instantiated type combos + +# if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +EXECUTE_TEST(double, int, int, TestExecSpace) +# endif + +# if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +EXECUTE_TEST(double, int64_t, int, TestExecSpace) +# endif + +# if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +EXECUTE_TEST(double, int, size_t, TestExecSpace) +# endif + +# if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +# endif + +# if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +EXECUTE_TEST(float, int, int, TestExecSpace) +#endif + +# if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +EXECUTE_TEST(float, int64_t, int, TestExecSpace) +# endif + +# if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +EXECUTE_TEST(float, int, size_t, TestExecSpace) +# endif + +# if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) +# endif + +# if !defined(NO_TEST_COMPLEX) + +# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) +# endif + +# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) +# endif + +# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) +# endif + +# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) +# endif + +# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) +# endif + +# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) +# endif + +# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) +# endif + +# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) +# endif + +# endif // !NO_TEST_COMPLEX + +#endif // ETI ON diff --git a/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp b/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp index e87514c3c6..d7a11ac934 100644 --- a/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp @@ -372,139 +372,13 @@ void testBlockCrsMatrix() { } } -#define EXECUTE_BLOCKCRS_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##blkcrsmatrix##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testBlockCrsMatrix(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#undef EXECUTE_BLOCKCRS_TEST +#include + +#undef EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_BsrMatrix.hpp b/unit_test/sparse/Test_Sparse_BsrMatrix.hpp index 49a0ce6d4f..26748690ac 100644 --- a/unit_test/sparse/Test_Sparse_BsrMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_BsrMatrix.hpp @@ -374,138 +374,12 @@ void testBsrMatrix() { } } -#define EXECUTE_BSR_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##bsrmatrix##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testBsrMatrix(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#undef EXECUTE_BSR_TEST +#include + +#undef EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp index 652b9fb8e3..e1600253ee 100644 --- a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp @@ -256,132 +256,6 @@ void testCrsMatrixHostMirror() { testCrsMatrixHostMirror(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif +#include #undef EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp index a3e2c1e1a9..d505e05608 100644 --- a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp @@ -418,132 +418,6 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, 500, 500 * 10, 70, 3); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif +#include #undef EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp index fc4ee67310..a9fe79ad8a 100644 --- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp @@ -741,132 +741,6 @@ void test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) { 10); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif +#include #undef EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_replaceSumInto.hpp b/unit_test/sparse/Test_Sparse_replaceSumInto.hpp index dc51be7f7b..da01c7a5be 100644 --- a/unit_test/sparse/Test_Sparse_replaceSumInto.hpp +++ b/unit_test/sparse/Test_Sparse_replaceSumInto.hpp @@ -273,132 +273,6 @@ void test_replaceSumInto() { test_replaceSumInto(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif +#include #undef EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp b/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp index 1c0e279366..8708cf8a95 100644 --- a/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp +++ b/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp @@ -518,133 +518,9 @@ void test_replaceSumIntoLonger() { // FIXME SYCL: test hangs or gives "CL error -46 invalid kernel name" #ifndef KOKKOS_ENABLE_SYCL -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif -#endif + +#include #undef EXECUTE_TEST + +#endif // KOKKOS_ENABLE_SYCL diff --git a/unit_test/sparse/Test_Sparse_spadd.hpp b/unit_test/sparse/Test_Sparse_spadd.hpp index 01c1aad2b9..224878b290 100644 --- a/unit_test/sparse/Test_Sparse_spadd.hpp +++ b/unit_test/sparse/Test_Sparse_spadd.hpp @@ -269,132 +269,6 @@ void test_spadd_known_columns() { test_spadd(50, 50, 75, 100, false); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif +#include #undef EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp index dd22bb90dc..577c099b96 100644 --- a/unit_test/sparse/Test_Sparse_spgemm.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm.hpp @@ -458,132 +458,6 @@ void test_issue402() { // test_spgemm(50000, 50000 * 30, 100, 10); // test_spgemm(50000, 50000 * 30, 200, 10); -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif +#include #undef EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp index 6f416e6f59..0cea5eda7c 100644 --- a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp @@ -266,132 +266,6 @@ void test_spgemm_jacobi(lno_t numRows, size_type nnz, lno_t bandwidth, 10); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif +#include #undef EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_spiluk.hpp b/unit_test/sparse/Test_Sparse_spiluk.hpp index 31bd4b47ec..e6036f1b32 100644 --- a/unit_test/sparse/Test_Sparse_spiluk.hpp +++ b/unit_test/sparse/Test_Sparse_spiluk.hpp @@ -305,136 +305,9 @@ void test_spiluk() { test_spiluk(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if 0 - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#endif +#define NO_TEST_COMPLEX + +#include #undef EXECUTE_TEST +#undef NO_TEST_COMPLEX diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index d8d4a7f7c5..dbc9c99998 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -1260,8 +1260,8 @@ void test_spmv_bsrmatrix_controls_pattern( // fill inputs with 1, for help debugging Kokkos::parallel_for("fill", Kokkos::MDRangePolicy>({0,0}, {hi_x.extent(0), hi_x.extent(1)}), - KOKKOS_LAMBDA (unsigned i, unsigned j) { - hi_x(i,j) = 1 + (i == 0 && j == 0); + KOKKOS_LAMBDA (unsigned i, unsigned j) { + hi_x(i,j) = 1 + (i == 0 && j == 0); } ); #endif @@ -1423,7 +1423,7 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spmv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ test_spmv(1000, 1000 * 3, 200, 10, true); \ @@ -1607,469 +1607,42 @@ EXECUTE_TEST_TC(double, double, double, int, size_t, LayoutRight, TestExecSpace) EXECUTE_TEST_ISSUE_101(TestExecSpace) #endif -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -EXECUTE_TEST_STRUCT(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -EXECUTE_TEST_STRUCT(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -EXECUTE_TEST_STRUCT(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -EXECUTE_TEST_STRUCT(float, int64_t, int, TestExecSpace) -#endif +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, TestExecSpace) \ + EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, TestExecSpace) -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(double, int, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(double, int64_t, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(double, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(double, int64_t, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(float, int, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(float, int64_t, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(float, int, size_t, LayoutLeft, TestExecSpace) -#endif +#include -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(float, int64_t, size_t, LayoutLeft, TestExecSpace) -#endif +#undef EXECUTE_TEST -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_double, int, int, LayoutLeft, - TestExecSpace) -#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_double, int64_t, int, LayoutLeft, - TestExecSpace) -#endif +# define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) \ + EXECUTE_TEST_MV_STRUCT(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_double, int, size_t, LayoutLeft, - TestExecSpace) -#endif +# include -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, size_t, LayoutLeft, - TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_double, int64_t, size_t, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_float, int, int, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_float, int64_t, int, LayoutLeft, - TestExecSpace) -#endif +# undef EXECUTE_TEST -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_float, int, size_t, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, size_t, LayoutLeft, - TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_float, int64_t, size_t, LayoutLeft, - TestExecSpace) -#endif #endif // defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, int, LayoutRight, TestExecSpace) -#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, int, LayoutRight, TestExecSpace) -#endif +# define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, size_t, LayoutRight, TestExecSpace) -#endif +# include -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, size_t, LayoutRight, TestExecSpace) -#endif +# undef EXECUTE_TEST -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, size_t, LayoutRight, TestExecSpace) -#endif +#endif // defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, size_t, LayoutRight, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, size_t, LayoutRight, - TestExecSpace) -#endif - -#undef EXECUTE_TEST +#undef EXECUTE_TEST_FN #undef EXECUTE_TEST_STRUCT #undef EXECUTE_TEST_MV #undef EXECUTE_TEST_MV_STRUCT diff --git a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp index 7996e9e4e6..146ac141eb 100644 --- a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp @@ -469,282 +469,27 @@ void testBlockCrsMatrix_SpM_MV() { ////////////////////////// -#define EXECUTE_BCRS_TIMES_VEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##bcrs_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testSpMVBlockCrsMatrix(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int64_t, size_t, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int64_t, size_t, - TestExecSpace) -#endif - -#undef EXECUTE_BCRS_TIMES_VEC_TEST +#include + +#undef EXECUTE_TEST ////////////////////////// -#define EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##bcrs_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testBlockCrsMatrix_SpM_MV(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, - TestExecSpace) -#endif - -#undef EXECUTE_BCRS_TIMES_MVEC_TEST + +#include + +#undef EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp index 6f1523f90f..1d0384e5df 100644 --- a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp @@ -574,281 +574,26 @@ void testBsrMatrix_SpM_MV() { ////////////////////////// -#define EXECUTE_BSR_TIMES_VEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##bsrmat_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testSpMVBsrMatrix(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int64_t, size_t, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#undef EXECUTE_BSR_TIMES_VEC_TEST +#include + +#undef EXECUTE_TEST ////////////////////////// -#define EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##bsrmat_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testBsrMatrix_SpM_MV(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, - TestExecSpace) -#endif - -#undef EXECUTE_BSR_TIMES_MVEC_TEST +#include + +#undef EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_sptrsv.hpp b/unit_test/sparse/Test_Sparse_sptrsv.hpp index 1be27d0c9c..0cf906133c 100644 --- a/unit_test/sparse/Test_Sparse_sptrsv.hpp +++ b/unit_test/sparse/Test_Sparse_sptrsv.hpp @@ -1093,132 +1093,6 @@ void test_sptrsv() { test_sptrsv(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif +#include #undef EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_trsv.hpp b/unit_test/sparse/Test_Sparse_trsv.hpp index fce73897a8..0effe11d23 100644 --- a/unit_test/sparse/Test_Sparse_trsv.hpp +++ b/unit_test/sparse/Test_Sparse_trsv.hpp @@ -107,297 +107,31 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, 1000, 1000 * 20, 100, 5, 10); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, size_t, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, size_t, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, size_t, LayoutRight, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, size_t, LayoutRight, - TestExecSpace) -#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + +# define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) + +# include + +# undef EXECUTE_TEST + +#endif // KOKKOSKERNELS_INST_LAYOUTLEFT + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + +# define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) + +# include + +# undef EXECUTE_TEST + +#endif // KOKKOSKERNELS_INST_LAYOUTRIGHT #undef EXECUTE_TEST_MV From 905e4ac91186a1d6e630231bb317663b646705d9 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 22 Feb 2022 15:24:48 -0700 Subject: [PATCH 010/261] Clang formatting --- .../Test_Common_Test_All_Type_Combos.hpp | 138 +++++++++--------- unit_test/sparse/Test_Sparse_BsrMatrix.hpp | 2 +- .../Test_Sparse_replaceSumIntoLonger.hpp | 2 +- unit_test/sparse/Test_Sparse_spmv.hpp | 44 +++--- .../sparse/Test_Sparse_spmv_blockcrs.hpp | 13 +- unit_test/sparse/Test_Sparse_spmv_bsr.hpp | 10 +- unit_test/sparse/Test_Sparse_trsv.hpp | 39 +++-- 7 files changed, 123 insertions(+), 125 deletions(-) diff --git a/unit_test/common/Test_Common_Test_All_Type_Combos.hpp b/unit_test/common/Test_Common_Test_All_Type_Combos.hpp index 60e0651e69..34a716929e 100644 --- a/unit_test/common/Test_Common_Test_All_Type_Combos.hpp +++ b/unit_test/common/Test_Common_Test_All_Type_Combos.hpp @@ -54,7 +54,7 @@ #error Test_Common_Test_All_Type_Combos.hpp requires EXECUTE_TEST to be set #endif -#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) // ETI is off, test all possible type combos @@ -68,7 +68,7 @@ EXECUTE_TEST(float, int64_t, int, TestExecSpace) EXECUTE_TEST(float, int, size_t, TestExecSpace) EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -# if !defined(NO_TEST_COMPLEX) +#if !defined(NO_TEST_COMPLEX) EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) @@ -79,110 +79,110 @@ EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -# endif +#endif #else // ETI is on, only test instantiated type combos -# if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) EXECUTE_TEST(double, int, int, TestExecSpace) -# endif +#endif -# if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) EXECUTE_TEST(double, int64_t, int, TestExecSpace) -# endif +#endif -# if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) EXECUTE_TEST(double, int, size_t, TestExecSpace) -# endif +#endif -# if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -# endif +#endif -# if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) EXECUTE_TEST(float, int, int, TestExecSpace) #endif -# if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) EXECUTE_TEST(float, int64_t, int, TestExecSpace) -# endif +#endif -# if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) EXECUTE_TEST(float, int, size_t, TestExecSpace) -# endif +#endif -# if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -# endif +#endif -# if !defined(NO_TEST_COMPLEX) +#if !defined(NO_TEST_COMPLEX) -# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -# endif +#endif -# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -# endif +#endif -# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -# endif +#endif -# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -# endif +#endif -# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -# endif +#endif -# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -# endif +#endif -# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -# endif +#endif -# if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -# endif +#endif -# endif // !NO_TEST_COMPLEX +#endif // !NO_TEST_COMPLEX -#endif // ETI ON +#endif // ETI ON diff --git a/unit_test/sparse/Test_Sparse_BsrMatrix.hpp b/unit_test/sparse/Test_Sparse_BsrMatrix.hpp index 26748690ac..8f70e5bca3 100644 --- a/unit_test/sparse/Test_Sparse_BsrMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_BsrMatrix.hpp @@ -374,7 +374,7 @@ void testBsrMatrix() { } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##bsrmatrix##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testBsrMatrix(); \ diff --git a/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp b/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp index 8708cf8a95..a9d8ac81b7 100644 --- a/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp +++ b/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp @@ -523,4 +523,4 @@ void test_replaceSumIntoLonger() { #undef EXECUTE_TEST -#endif // KOKKOS_ENABLE_SYCL +#endif // KOKKOS_ENABLE_SYCL diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index dbc9c99998..5e40c4174f 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -538,8 +538,8 @@ void test_spmv_struct_1D(lno_t nx, lno_t leftBC, lno_t rightBC) { Kokkos::View structure("Spmv Structure", 1); structure(0) = nx; - Kokkos::View mat_structure("Matrix Structure", - 1); + Kokkos::View mat_structure("Matrix Structure", + 1); mat_structure(0, 0) = nx; if (leftBC == 1) { mat_structure(0, 1) = 1; @@ -584,8 +584,8 @@ void test_spmv_struct_2D(lno_t nx, lno_t ny, lno_t horizontalBC, Kokkos::View structure("Spmv Structure", 2); structure(0) = nx; structure(1) = ny; - Kokkos::View mat_structure("Matrix Structure", - 2); + Kokkos::View mat_structure("Matrix Structure", + 2); mat_structure(0, 0) = nx; if (horizontalBC == 1 || horizontalBC == 3) { mat_structure(0, 1) = 1; @@ -650,8 +650,8 @@ void test_spmv_struct_3D(lno_t nx, lno_t ny, lno_t nz, lno_t horizontal1BC, structure(0) = nx; structure(1) = ny; structure(2) = nz; - Kokkos::View mat_structure("Matrix Structure", - 3); + Kokkos::View mat_structure("Matrix Structure", + 3); mat_structure(0, 0) = nx; if (horizontal1BC == 1 || horizontal1BC == 3) { mat_structure(0, 1) = 1; @@ -720,8 +720,8 @@ void test_spmv_mv_struct_1D(lno_t nx, int numMV) { Kokkos::View structure("Spmv Structure", 1); structure(0) = nx; - Kokkos::View mat_structure("Matrix Structure", - 1); + Kokkos::View mat_structure("Matrix Structure", + 1); mat_structure(0, 0) = nx; mat_structure(0, 1) = 1; mat_structure(0, 2) = 1; @@ -1607,38 +1607,38 @@ EXECUTE_TEST_TC(double, double, double, int, size_t, LayoutRight, TestExecSpace) EXECUTE_TEST_ISSUE_101(TestExecSpace) #endif -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, TestExecSpace) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, TestExecSpace) \ EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, TestExecSpace) #include #undef EXECUTE_TEST -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -# define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) \ EXECUTE_TEST_MV_STRUCT(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) -# include +#include -# undef EXECUTE_TEST +#undef EXECUTE_TEST #endif // defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -# define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) -# include +#include -# undef EXECUTE_TEST +#undef EXECUTE_TEST #endif // defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) diff --git a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp index 146ac141eb..0462a36098 100644 --- a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp @@ -96,8 +96,8 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // The mat_structure view is used to generate a matrix using // finite difference (FD) or finite element (FE) discretization // on a cartesian grid. - Kokkos::View mat_structure("Matrix Structure", - 3); + Kokkos::View mat_structure("Matrix Structure", + 3); mat_structure(0, 0) = 8; // Request 8 grid point in 'x' direction mat_structure(0, 1) = 0; // Add BC to the left mat_structure(0, 2) = 0; // Add BC to the right @@ -237,8 +237,8 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, // The mat_structure view is used to generate a matrix using // finite difference (FD) or finite element (FE) discretization // on a cartesian grid. - Kokkos::View mat_structure("Matrix Structure", - 3); + Kokkos::View mat_structure("Matrix Structure", + 3); mat_structure(0, 0) = 7; // Request 7 grid point in 'x' direction mat_structure(0, 1) = 0; // Add BC to the left mat_structure(0, 2) = 0; // Add BC to the right @@ -469,7 +469,7 @@ void testBlockCrsMatrix_SpM_MV() { ////////////////////////// -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##bcrs_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -482,14 +482,13 @@ void testBlockCrsMatrix_SpM_MV() { ////////////////////////// -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##bcrs_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testBlockCrsMatrix_SpM_MV(); \ } - #include #undef EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp index 1d0384e5df..c40126fa7c 100644 --- a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp @@ -141,8 +141,8 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // The mat_structure view is used to generate a matrix using // finite difference (FD) or finite element (FE) discretization // on a cartesian grid. - Kokkos::View mat_structure("Matrix Structure", - 3); + Kokkos::View mat_structure("Matrix Structure", + 3); mat_structure(0, 0) = 8; // Request 8 grid point in 'x' direction mat_structure(0, 1) = 0; // Add BC to the left mat_structure(0, 2) = 0; // Add BC to the right @@ -273,8 +273,8 @@ void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, // The mat_structure view is used to generate a matrix using // finite difference (FD) or finite element (FE) discretization // on a cartesian grid. - Kokkos::View mat_structure("Matrix Structure", - 3); + Kokkos::View mat_structure("Matrix Structure", + 3); mat_structure(0, 0) = 7; // Request 7 grid point in 'x' direction mat_structure(0, 1) = 0; // Add BC to the left mat_structure(0, 2) = 0; // Add BC to the right @@ -574,7 +574,7 @@ void testBsrMatrix_SpM_MV() { ////////////////////////// -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##bsrmat_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ diff --git a/unit_test/sparse/Test_Sparse_trsv.hpp b/unit_test/sparse/Test_Sparse_trsv.hpp index 0effe11d23..2bd0853b73 100644 --- a/unit_test/sparse/Test_Sparse_trsv.hpp +++ b/unit_test/sparse/Test_Sparse_trsv.hpp @@ -28,12 +28,11 @@ void check_trsv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type b, // typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename crsMat_t::values_type::non_const_type scalar_view_t; typedef typename scalar_view_t::value_type ScalarA; - double eps = (std::is_same::value - ? 2 * 1e-2 - : (std::is_same>::value || - std::is_same>::value) - ? 2 * 1e-1 - : 1e-7); + double eps = (std::is_same::value ? 2 * 1e-2 + : (std::is_same>::value || + std::is_same>::value) + ? 2 * 1e-1 + : 1e-7); Kokkos::fence(); KokkosSparse::trsv(uplo, trans, "N", input_mat, b, x); @@ -107,31 +106,31 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, 1000, 1000 * 20, 100, 5, 10); \ } -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -# define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) -# include +#include -# undef EXECUTE_TEST +#undef EXECUTE_TEST -#endif // KOKKOSKERNELS_INST_LAYOUTLEFT +#endif // KOKKOSKERNELS_INST_LAYOUTLEFT -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -# define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) -# include +#include -# undef EXECUTE_TEST +#undef EXECUTE_TEST -#endif // KOKKOSKERNELS_INST_LAYOUTRIGHT +#endif // KOKKOSKERNELS_INST_LAYOUTRIGHT #undef EXECUTE_TEST_MV From d596920a4eadebc1f32e3af343a8ef2a28c55c3d Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 22 Feb 2022 15:40:36 -0700 Subject: [PATCH 011/261] Change EXECUTE_TEST to KOKKOSKERNELS_EXECUTE_TEST --- .../Test_Common_Test_All_Type_Combos.hpp | 70 +++++++++---------- .../sparse/Test_Sparse_BlockCrsMatrix.hpp | 4 +- unit_test/sparse/Test_Sparse_BsrMatrix.hpp | 4 +- unit_test/sparse/Test_Sparse_CrsMatrix.hpp | 4 +- .../sparse/Test_Sparse_block_gauss_seidel.hpp | 4 +- unit_test/sparse/Test_Sparse_gauss_seidel.hpp | 4 +- .../sparse/Test_Sparse_replaceSumInto.hpp | 4 +- .../Test_Sparse_replaceSumIntoLonger.hpp | 4 +- unit_test/sparse/Test_Sparse_spadd.hpp | 4 +- unit_test/sparse/Test_Sparse_spgemm.hpp | 4 +- .../sparse/Test_Sparse_spgemm_jacobi.hpp | 4 +- unit_test/sparse/Test_Sparse_spiluk.hpp | 4 +- unit_test/sparse/Test_Sparse_spmv.hpp | 14 ++-- .../sparse/Test_Sparse_spmv_blockcrs.hpp | 8 +-- unit_test/sparse/Test_Sparse_spmv_bsr.hpp | 8 +-- unit_test/sparse/Test_Sparse_sptrsv.hpp | 4 +- unit_test/sparse/Test_Sparse_trsv.hpp | 8 +-- 17 files changed, 78 insertions(+), 78 deletions(-) diff --git a/unit_test/common/Test_Common_Test_All_Type_Combos.hpp b/unit_test/common/Test_Common_Test_All_Type_Combos.hpp index 34a716929e..4e4ba0ef34 100644 --- a/unit_test/common/Test_Common_Test_All_Type_Combos.hpp +++ b/unit_test/common/Test_Common_Test_All_Type_Combos.hpp @@ -45,13 +45,13 @@ /// \file Test_Common_Test_All_Type_Combos.hpp /** - * EXECUTE_TEST should take (SCALAR, ORDINAL, OFFSET, DEVICE). All + * KOKKOSKERNELS_EXECUTE_TEST should take (SCALAR, ORDINAL, OFFSET, DEVICE). All * these args are types. * #define NO_TEST_COMPLEX to skip testing of kokkos complex types */ -#if !defined(EXECUTE_TEST) -#error Test_Common_Test_All_Type_Combos.hpp requires EXECUTE_TEST to be set +#if !defined(KOKKOSKERNELS_EXECUTE_TEST) +#error Test_Common_Test_All_Type_Combos.hpp requires KOKKOSKERNELS_EXECUTE_TEST to be set #endif #if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ @@ -59,25 +59,25 @@ // ETI is off, test all possible type combos -EXECUTE_TEST(double, int, int, TestExecSpace) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -EXECUTE_TEST(float, int, int, TestExecSpace) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) #if !defined(NO_TEST_COMPLEX) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) #endif @@ -88,49 +88,49 @@ EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) #if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -EXECUTE_TEST(double, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -EXECUTE_TEST(float, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_FLOAT) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) #endif #if !defined(NO_TEST_COMPLEX) @@ -138,49 +138,49 @@ EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) #endif #endif // !NO_TEST_COMPLEX diff --git a/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp b/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp index d7a11ac934..6eb4488c72 100644 --- a/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp @@ -372,7 +372,7 @@ void testBlockCrsMatrix() { } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##blkcrsmatrix##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -381,4 +381,4 @@ void testBlockCrsMatrix() { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_BsrMatrix.hpp b/unit_test/sparse/Test_Sparse_BsrMatrix.hpp index 8f70e5bca3..501ebc2ead 100644 --- a/unit_test/sparse/Test_Sparse_BsrMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_BsrMatrix.hpp @@ -374,7 +374,7 @@ void testBsrMatrix() { } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##bsrmatrix##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testBsrMatrix(); \ @@ -382,4 +382,4 @@ void testBsrMatrix() { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp index e1600253ee..27152d76a6 100644 --- a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp @@ -244,7 +244,7 @@ void testCrsMatrixHostMirror() { EXPECT_EQ(zeroHost.graph.row_map.extent(0), 0); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##crsmatrix##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testCrsMatrix(); \ @@ -258,4 +258,4 @@ void testCrsMatrixHostMirror() { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp index d505e05608..04f7b5eacc 100644 --- a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp @@ -404,7 +404,7 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, // device::execution_space::finalize(); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##block_gauss_seidel_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -420,4 +420,4 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp index a9fe79ad8a..7acb94ef61 100644 --- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp @@ -687,7 +687,7 @@ void test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) { EXPECT_LT(result_norm_res, 0.25 * initial_norm_res); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##gauss_seidel_asymmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -743,4 +743,4 @@ void test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_replaceSumInto.hpp b/unit_test/sparse/Test_Sparse_replaceSumInto.hpp index da01c7a5be..4036e7ddbd 100644 --- a/unit_test/sparse/Test_Sparse_replaceSumInto.hpp +++ b/unit_test/sparse/Test_Sparse_replaceSumInto.hpp @@ -266,7 +266,7 @@ void test_replaceSumInto() { EXPECT_TRUE(success); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##replaceSumInto##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -275,4 +275,4 @@ void test_replaceSumInto() { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp b/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp index a9d8ac81b7..e5e1266e1d 100644 --- a/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp +++ b/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp @@ -509,7 +509,7 @@ void test_replaceSumIntoLonger() { EXPECT_TRUE(success); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##replaceSumIntoLonger##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -521,6 +521,6 @@ void test_replaceSumIntoLonger() { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST #endif // KOKKOS_ENABLE_SYCL diff --git a/unit_test/sparse/Test_Sparse_spadd.hpp b/unit_test/sparse/Test_Sparse_spadd.hpp index 224878b290..5b4e9e47b8 100644 --- a/unit_test/sparse/Test_Sparse_spadd.hpp +++ b/unit_test/sparse/Test_Sparse_spadd.hpp @@ -250,7 +250,7 @@ void test_spadd_known_columns() { ASSERT_EQ(A.nnz(), C.nnz()); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##spadd_sorted_input##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -271,4 +271,4 @@ void test_spadd_known_columns() { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp index 577c099b96..b84ef6acc4 100644 --- a/unit_test/sparse/Test_Sparse_spgemm.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm.hpp @@ -437,7 +437,7 @@ void test_issue402() { << "KKMEM still has issue 402 bug; C=AA' is incorrect!\n"; } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spgemm##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ test_spgemm(10000, 10000, 10000, \ @@ -460,4 +460,4 @@ void test_issue402() { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp index 0cea5eda7c..885b1a07fe 100644 --- a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp @@ -258,7 +258,7 @@ void test_spgemm_jacobi(lno_t numRows, size_type nnz, lno_t bandwidth, EXPECT_TRUE(is_identical); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##spgemm_jacobi##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -268,4 +268,4 @@ void test_spgemm_jacobi(lno_t numRows, size_type nnz, lno_t bandwidth, #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_spiluk.hpp b/unit_test/sparse/Test_Sparse_spiluk.hpp index e6036f1b32..353543b751 100644 --- a/unit_test/sparse/Test_Sparse_spiluk.hpp +++ b/unit_test/sparse/Test_Sparse_spiluk.hpp @@ -299,7 +299,7 @@ void test_spiluk() { Test::run_test_spiluk(); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spiluk##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ test_spiluk(); \ @@ -309,5 +309,5 @@ void test_spiluk() { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST #undef NO_TEST_COMPLEX diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index 5e40c4174f..a5a95e14c1 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -1607,25 +1607,25 @@ EXECUTE_TEST_TC(double, double, double, int, size_t, LayoutRight, TestExecSpace) EXECUTE_TEST_ISSUE_101(TestExecSpace) #endif -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, TestExecSpace) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, TestExecSpace) \ EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, TestExecSpace) #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) \ EXECUTE_TEST_MV_STRUCT(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST #endif // defined(KOKKOSKERNELS_INST_LAYOUTLEFT) @@ -1633,12 +1633,12 @@ EXECUTE_TEST_ISSUE_101(TestExecSpace) (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST #endif // defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) diff --git a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp index 0462a36098..c076da4015 100644 --- a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp @@ -469,7 +469,7 @@ void testBlockCrsMatrix_SpM_MV() { ////////////////////////// -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##bcrs_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -478,11 +478,11 @@ void testBlockCrsMatrix_SpM_MV() { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST ////////////////////////// -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##bcrs_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -491,4 +491,4 @@ void testBlockCrsMatrix_SpM_MV() { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp index c40126fa7c..4399bcd58b 100644 --- a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp @@ -574,7 +574,7 @@ void testBsrMatrix_SpM_MV() { ////////////////////////// -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##bsrmat_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -583,11 +583,11 @@ void testBsrMatrix_SpM_MV() { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST ////////////////////////// -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##bsrmat_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -596,4 +596,4 @@ void testBsrMatrix_SpM_MV() { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_sptrsv.hpp b/unit_test/sparse/Test_Sparse_sptrsv.hpp index 0cf906133c..0b175da13d 100644 --- a/unit_test/sparse/Test_Sparse_sptrsv.hpp +++ b/unit_test/sparse/Test_Sparse_sptrsv.hpp @@ -1087,7 +1087,7 @@ void test_sptrsv() { // Test::run_test_sptrsv_mtx(); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##sptrsv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ test_sptrsv(); \ @@ -1095,4 +1095,4 @@ void test_sptrsv() { #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_trsv.hpp b/unit_test/sparse/Test_Sparse_trsv.hpp index 2bd0853b73..8adcfb6821 100644 --- a/unit_test/sparse/Test_Sparse_trsv.hpp +++ b/unit_test/sparse/Test_Sparse_trsv.hpp @@ -110,12 +110,12 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST #endif // KOKKOSKERNELS_INST_LAYOUTLEFT @@ -123,12 +123,12 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) #include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST #endif // KOKKOSKERNELS_INST_LAYOUTRIGHT From a97e7992d863fcbd07dd8332e1974a6e5400bdb9 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 22 Feb 2022 16:50:25 -0700 Subject: [PATCH 012/261] Another attempt at clang-format-8 --- .../Test_Common_Test_All_Type_Combos.hpp | 6 ++++-- unit_test/sparse/Test_Sparse_spadd.hpp | 2 +- unit_test/sparse/Test_Sparse_spmv.hpp | 20 +++++++++---------- .../sparse/Test_Sparse_spmv_blockcrs.hpp | 8 ++++---- unit_test/sparse/Test_Sparse_spmv_bsr.hpp | 8 ++++---- unit_test/sparse/Test_Sparse_trsv.hpp | 11 +++++----- 6 files changed, 29 insertions(+), 26 deletions(-) diff --git a/unit_test/common/Test_Common_Test_All_Type_Combos.hpp b/unit_test/common/Test_Common_Test_All_Type_Combos.hpp index 4e4ba0ef34..afacb09ee9 100644 --- a/unit_test/common/Test_Common_Test_All_Type_Combos.hpp +++ b/unit_test/common/Test_Common_Test_All_Type_Combos.hpp @@ -73,7 +73,8 @@ KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, + TestExecSpace) KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) @@ -156,7 +157,8 @@ KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) -KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, + TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ diff --git a/unit_test/sparse/Test_Sparse_spadd.hpp b/unit_test/sparse/Test_Sparse_spadd.hpp index 5b4e9e47b8..881f891837 100644 --- a/unit_test/sparse/Test_Sparse_spadd.hpp +++ b/unit_test/sparse/Test_Sparse_spadd.hpp @@ -250,7 +250,7 @@ void test_spadd_known_columns() { ASSERT_EQ(A.nnz(), C.nnz()); } -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##spadd_sorted_input##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index f8cdefef4c..3cbe3d401d 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -538,8 +538,8 @@ void test_spmv_struct_1D(lno_t nx, lno_t leftBC, lno_t rightBC) { Kokkos::View structure("Spmv Structure", 1); structure(0) = nx; - Kokkos::View mat_structure("Matrix Structure", - 1); + Kokkos::View mat_structure("Matrix Structure", + 1); mat_structure(0, 0) = nx; if (leftBC == 1) { mat_structure(0, 1) = 1; @@ -584,8 +584,8 @@ void test_spmv_struct_2D(lno_t nx, lno_t ny, lno_t horizontalBC, Kokkos::View structure("Spmv Structure", 2); structure(0) = nx; structure(1) = ny; - Kokkos::View mat_structure("Matrix Structure", - 2); + Kokkos::View mat_structure("Matrix Structure", + 2); mat_structure(0, 0) = nx; if (horizontalBC == 1 || horizontalBC == 3) { mat_structure(0, 1) = 1; @@ -650,8 +650,8 @@ void test_spmv_struct_3D(lno_t nx, lno_t ny, lno_t nz, lno_t horizontal1BC, structure(0) = nx; structure(1) = ny; structure(2) = nz; - Kokkos::View mat_structure("Matrix Structure", - 3); + Kokkos::View mat_structure("Matrix Structure", + 3); mat_structure(0, 0) = nx; if (horizontal1BC == 1 || horizontal1BC == 3) { mat_structure(0, 1) = 1; @@ -720,8 +720,8 @@ void test_spmv_mv_struct_1D(lno_t nx, int numMV) { Kokkos::View structure("Spmv Structure", 1); structure(0) = nx; - Kokkos::View mat_structure("Matrix Structure", - 1); + Kokkos::View mat_structure("Matrix Structure", + 1); mat_structure(0, 0) = nx; mat_structure(0, 1) = 1; mat_structure(0, 2) = 1; @@ -1612,8 +1612,8 @@ EXECUTE_TEST_TC(double, double, double, int, size_t, LayoutRight, TestExecSpace) EXECUTE_TEST_ISSUE_101(TestExecSpace) #endif -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, TestExecSpace) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, TestExecSpace) \ EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, TestExecSpace) #include diff --git a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp index 8adb06300c..b3bbe25718 100644 --- a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp @@ -96,8 +96,8 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // The mat_structure view is used to generate a matrix using // finite difference (FD) or finite element (FE) discretization // on a cartesian grid. - Kokkos::View mat_structure("Matrix Structure", - 3); + Kokkos::View mat_structure("Matrix Structure", + 3); mat_structure(0, 0) = 8; // Request 8 grid point in 'x' direction mat_structure(0, 1) = 0; // Add BC to the left mat_structure(0, 2) = 0; // Add BC to the right @@ -237,8 +237,8 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, // The mat_structure view is used to generate a matrix using // finite difference (FD) or finite element (FE) discretization // on a cartesian grid. - Kokkos::View mat_structure("Matrix Structure", - 3); + Kokkos::View mat_structure("Matrix Structure", + 3); mat_structure(0, 0) = 7; // Request 7 grid point in 'x' direction mat_structure(0, 1) = 0; // Add BC to the left mat_structure(0, 2) = 0; // Add BC to the right diff --git a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp index bca6d2ddf6..b8cd411154 100644 --- a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp @@ -141,8 +141,8 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // The mat_structure view is used to generate a matrix using // finite difference (FD) or finite element (FE) discretization // on a cartesian grid. - Kokkos::View mat_structure("Matrix Structure", - 3); + Kokkos::View mat_structure("Matrix Structure", + 3); mat_structure(0, 0) = 8; // Request 8 grid point in 'x' direction mat_structure(0, 1) = 0; // Add BC to the left mat_structure(0, 2) = 0; // Add BC to the right @@ -273,8 +273,8 @@ void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, // The mat_structure view is used to generate a matrix using // finite difference (FD) or finite element (FE) discretization // on a cartesian grid. - Kokkos::View mat_structure("Matrix Structure", - 3); + Kokkos::View mat_structure("Matrix Structure", + 3); mat_structure(0, 0) = 7; // Request 7 grid point in 'x' direction mat_structure(0, 1) = 0; // Add BC to the left mat_structure(0, 2) = 0; // Add BC to the right diff --git a/unit_test/sparse/Test_Sparse_trsv.hpp b/unit_test/sparse/Test_Sparse_trsv.hpp index 8adcfb6821..4b1f00c98a 100644 --- a/unit_test/sparse/Test_Sparse_trsv.hpp +++ b/unit_test/sparse/Test_Sparse_trsv.hpp @@ -28,11 +28,12 @@ void check_trsv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type b, // typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename crsMat_t::values_type::non_const_type scalar_view_t; typedef typename scalar_view_t::value_type ScalarA; - double eps = (std::is_same::value ? 2 * 1e-2 - : (std::is_same>::value || - std::is_same>::value) - ? 2 * 1e-1 - : 1e-7); + double eps = (std::is_same::value + ? 2 * 1e-2 + : (std::is_same>::value || + std::is_same>::value) + ? 2 * 1e-1 + : 1e-7); Kokkos::fence(); KokkosSparse::trsv(uplo, trans, "N", input_mat, b, x); From 0e4d10dc717798fa4a16dd32d2e7c8142c84342f Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Mon, 17 Jan 2022 15:02:14 -0500 Subject: [PATCH 013/261] KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA is always defined --- perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp | 2 -- perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp | 2 -- 2 files changed, 4 deletions(-) diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp index a8b3de209b..7e4dd8fa2d 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -3,13 +3,11 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) #define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT #endif #endif -#endif #if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT) diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp index fb9cd6297d..abc96148b1 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -3,11 +3,9 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) #define KOKKOSBATCHED_TEST_BLOCKTRIDIAGJACOBI #endif -#endif #if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGJACOBI) From 368d5f2c370d716e4177c060e2fbe46e0941634b Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 18 Jan 2022 11:27:36 -0500 Subject: [PATCH 014/261] Enable perf test for non-CUDA builds --- perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp | 4 +--- perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp index 7e4dd8fa2d..e888609f14 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -3,11 +3,9 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#if !(defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA)) #define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT #endif -#endif #if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT) diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp index abc96148b1..cf857c6779 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -3,7 +3,7 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) +#if !(defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA)) #define KOKKOSBATCHED_TEST_BLOCKTRIDIAGJACOBI #endif From 2bd1b217c5ce3188415baffa7c5055ef6bed53c9 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 18 Jan 2022 11:32:17 -0500 Subject: [PATCH 015/261] Template perf traits on execution space to avoid using Kokkos::Impl::ActiveExecutionMemorySpace --- .../KokkosBatched_Test_BlockTridiagDirect.cpp | 69 +++++++++++++---- .../KokkosBatched_Test_BlockTridiagJacobi.cpp | 74 +++++++++++++++---- 2 files changed, 116 insertions(+), 27 deletions(-) diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp index e888609f14..d6abdb4d62 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -71,38 +71,82 @@ typedef Vector, internal_vector_length> internal_vector_type; typedef value_type internal_vector_type; #endif -template +template struct FactorizeModeAndAlgo; -template <> -struct FactorizeModeAndAlgo { +struct FactorizeModeAndAlgoHostImpl { typedef Mode::Serial mode_type; typedef Algo::Level3::Blocked algo_type; }; -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_SERIAL) +template <> +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +template <> +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) template <> -struct FactorizeModeAndAlgo { +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoHostImpl {}; +#endif + +struct FactorizeModeAndAlgoDeviceImpl { typedef Mode::Team mode_type; typedef Algo::Level3::Unblocked algo_type; }; + +#if defined(KOKKOS_ENABLE_CUDA) +template <> +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoDeviceImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +template <> +struct FactorizeModeAndAlgo + : FactorizeModeAndAlgoDeviceImpl {}; #endif -template +template struct SolveModeAndAlgo; -template <> -struct SolveModeAndAlgo { +struct SolveModeAndAlgoHostImpl { typedef Mode::Serial mode_type; typedef Algo::Level2::Blocked algo_type; }; -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_SERIAL) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) template <> -struct SolveModeAndAlgo { +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +struct SolveModeAndAlgoDeviceImpl { typedef Mode::Team mode_type; typedef Algo::Level2::Unblocked algo_type; }; + +#if defined(KOKKOS_ENABLE_CUDA) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +template <> +struct SolveModeAndAlgo + : SolveModeAndAlgoDeviceImpl {}; #endif int main(int argc, char *argv[]) { @@ -272,8 +316,7 @@ int main(int argc, char *argv[]) { Kokkos::parallel_for( "factorize", policy.set_scratch_size(0, Kokkos::PerTeam(S)), KOKKOS_LAMBDA(const member_type &member) { - typedef FactorizeModeAndAlgo< - Kokkos::Impl::ActiveExecutionMemorySpace> + typedef FactorizeModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; @@ -355,7 +398,7 @@ int main(int argc, char *argv[]) { Kokkos::parallel_for( "solve", policy.set_scratch_size(0, Kokkos::PerTeam(S)), KOKKOS_LAMBDA(const member_type &member) { - typedef SolveModeAndAlgo + typedef SolveModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp index cf857c6779..8513cad752 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -73,38 +73,86 @@ typedef Vector, internal_vector_length> internal_vector_type; typedef value_type internal_vector_type; #endif -template +template struct InverseDiagonalsModeAndAlgo; -template <> -struct InverseDiagonalsModeAndAlgo { +struct InverseDiagonalsModeAndAlgoHostImpl { typedef Mode::Serial mode_type; typedef Algo::Level3::Blocked algo_type; }; -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_SERIAL) +template <> +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +template <> +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_ONPENMP) template <> -struct InverseDiagonalsModeAndAlgo { +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoHostImpl {}; +#endif + +struct InverseDiagonalsModeAndAlgoDeviceImpl { typedef Mode::Team mode_type; typedef Algo::Level3::Unblocked algo_type; }; + +#if defined(KOKKOS_ENABLE_CUDA) +template <> +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoDeviceImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +template <> +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoDeviceImpl {}; #endif -template +template struct SolveModeAndAlgo; -template <> -struct SolveModeAndAlgo { +struct SolveModeAndAlgoHostImpl { typedef Mode::Serial mode_type; typedef Algo::Level2::Blocked algo_type; }; -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_SERIAL) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) template <> -struct SolveModeAndAlgo { +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +struct SolveModeAndAlgoDeviceImpl { typedef Mode::Team mode_type; typedef Algo::Level2::Unblocked algo_type; }; + +#if defined(KOKKOS_ENABLE_CUDA) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +template <> +struct SolveModeAndAlgo + : SolveModeAndAlgoDeviceImpl {}; #endif int main(int argc, char *argv[]) { @@ -280,8 +328,7 @@ int main(int argc, char *argv[]) { policy.set_scratch_size( 0, Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), KOKKOS_LAMBDA(const member_type &member) { - typedef InverseDiagonalsModeAndAlgo< - Kokkos::Impl::ActiveExecutionMemorySpace> + typedef InverseDiagonalsModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; @@ -363,8 +410,7 @@ int main(int argc, char *argv[]) { 0, Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), KOKKOS_LAMBDA(const member_type &member) { - typedef SolveModeAndAlgo< - Kokkos::Impl::ActiveExecutionMemorySpace> + typedef SolveModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; From 9d48485e646ebfc048fd243f17c2769aae53e7aa Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 16 Feb 2022 12:56:17 -0700 Subject: [PATCH 016/261] perf_test/batched: Remove lambda from BlockJacobi --- ...okkosBatched_Test_BlockJacobi_Tutorial.cpp | 237 ++++++++++++------ 1 file changed, 157 insertions(+), 80 deletions(-) diff --git a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp index f3237d9b4f..94f58fba83 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp @@ -3,16 +3,6 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) -#if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#define KOKKOSBATCHED_TEST_BLOCKJACOBI -#endif -#endif -#endif - -#if defined(KOKKOSBATCHED_TEST_BLOCKJACOBI) - /// KokkosKernels headers #include "KokkosBatched_Util.hpp" @@ -79,6 +69,152 @@ val_type computeResidual(const ManyMatrixType &A, const ManyVectorType &x, return residual; } +namespace ConstructBlockJacobi { +template +struct Task1Factorize { + private: + VT __A; + + public: + Task1Factorize(VT A) : __A(A) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + TeamLU::invoke(member, AA); + } +}; + +template +struct Task1SetIdentity { + private: + VT __A; + + public: + Task1SetIdentity(VT A) : __A(A) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + TeamSetIdentity::invoke(member, AA); + } +}; + +template +struct Task1SolveLowerTriangular { + private: + VTA __A; + VTT __T; + + public: + Task1SolveLowerTriangular(VTA A, VTT T) : __A(A), __T(T) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL()); + TeamTrsm::invoke(member, one, TT, AA); + } +}; + +template +struct Task1SolveUpperTriangular { + private: + VTA __A; + VTT __T; + + public: + Task1SolveUpperTriangular(VTA A, VTT T) : __A(A), __T(T) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL()); + TeamTrsm::invoke(member, one, TT, + AA); + } +}; +} // namespace ConstructBlockJacobi + +template +struct Task1ApplyBlockJacobi { + private: + VTA __A; + VTX __x; + VTB __b; + + public: + Task1ApplyBlockJacobi(VTA A, VTX x, VTB b) : __A(A), __x(x), __b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + const val_type one(1), zero(0); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(__x, i, Kokkos::ALL()); + auto bb = Kokkos::subview(__b, i, Kokkos::ALL()); + TeamGemv::invoke( + member, one, AA, bb, zero, xx); + } +}; + +template +struct Task2FactorizeInvert { + private: + VTA __A; + VTT __T; + + public: + Task2FactorizeInvert(VTA A, VTT T) : __A(A), __T(T) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const val_type one(1); + const int i = member.league_rank(); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL()); + + TeamLU::invoke(member, AA); + TeamCopy::invoke(member, AA, TT); + TeamSetIdentity::invoke(member, AA); + TeamTrsm::invoke(member, one, TT, AA); + TeamTrsm::invoke(member, one, TT, + AA); + } +}; + +template +struct Task2ApplyBlockJacobi { + private: + VTA __A; + VTX __x; + VTB __b; + + public: + Task2ApplyBlockJacobi(VTA A, VTX x, VTB b) : __A(A), __x(x), __b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + const val_type one(1), zero(0); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(__x, i, Kokkos::ALL()); + auto bb = Kokkos::subview(__b, i, Kokkos::ALL()); + TeamGemv::invoke( + member, one, AA, bb, zero, xx); + } +}; + int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { @@ -159,44 +295,21 @@ int main(int argc, char *argv[]) { timer.reset(); Kokkos::parallel_for( "task1.factorize", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - TeamLU::invoke(member, AA); - }); + ConstructBlockJacobi::Task1Factorize(A)); Kokkos::deep_copy(T, A); Kokkos::parallel_for( "task1.set-identity", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - TeamSetIdentity::invoke(member, AA); - }); + ConstructBlockJacobi::Task1SetIdentity(A)); Kokkos::fence(); Kokkos::parallel_for( "task1.solve-lower-triangular", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm::invoke(member, one, - TT, AA); - }); + ConstructBlockJacobi::Task1SolveLowerTriangular(A, T)); Kokkos::fence(); Kokkos::parallel_for( "task1.solve-upper-triangular", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm::invoke(member, - one, TT, - AA); - }); + ConstructBlockJacobi::Task1SolveUpperTriangular(A, T)); Kokkos::fence(); const double t = timer.seconds(); printf( @@ -211,16 +324,8 @@ int main(int argc, char *argv[]) { policy_type policy(A.extent(0), Kokkos::AUTO()); Kokkos::parallel_for( "task1.apply-block-jacobi", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1), zero(0); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto bb = Kokkos::subview(b, i, Kokkos::ALL()); - TeamGemv::invoke(member, one, AA, bb, - zero, xx); - }); + Task1ApplyBlockJacobi(A, x, + b)); const double t = timer.seconds(); printf( "task 1: application of jacobi time = %f , # of applications per " @@ -256,23 +361,7 @@ int main(int argc, char *argv[]) { timer.reset(); Kokkos::parallel_for( "task2.factorize-invert", policy, - KOKKOS_LAMBDA(const member_type &member) { - const val_type one(1); - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - - TeamLU::invoke(member, AA); - TeamCopy::invoke(member, AA, TT); - TeamSetIdentity::invoke(member, AA); - TeamTrsm::invoke(member, one, - TT, AA); - TeamTrsm::invoke(member, - one, TT, - AA); - }); + Task2FactorizeInvert(A, T)); Kokkos::fence(); const double t = timer.seconds(); printf( @@ -287,16 +376,8 @@ int main(int argc, char *argv[]) { policy_type policy(A.extent(0), Kokkos::AUTO()); Kokkos::parallel_for( "task2.apply-block-jacobi", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1), zero(0); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto bb = Kokkos::subview(b, i, Kokkos::ALL()); - TeamGemv::invoke(member, one, AA, bb, - zero, xx); - }); + Task2ApplyBlockJacobi(A, x, + b)); const double t = timer.seconds(); printf( "task 2: application of jacobi time = %f , # of applications per " @@ -318,7 +399,3 @@ int main(int argc, char *argv[]) { return 0; } - -#else -int main() { return 0; } -#endif From 9ab0ecf790c1c6242263e8e5cb670e337bd4e576 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 16 Feb 2022 13:12:27 -0700 Subject: [PATCH 017/261] perf_test/batched: Remove lambda from BlockTridiagDirect --- .../KokkosBatched_Test_BlockTridiagDirect.cpp | 212 +++++++++--------- 1 file changed, 107 insertions(+), 105 deletions(-) diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp index d6abdb4d62..ffa6efec5e 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -3,12 +3,6 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if !(defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_CUDA_LAMBDA)) -#define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT -#endif - -#if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT) - /// KokkosKernels headers #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Vector.hpp" @@ -43,11 +37,13 @@ #define KOKKOSBATCHED_USE_128BIT_MEMORY_INST -typedef Kokkos::DefaultExecutionSpace exec_space; -typedef typename exec_space::memory_space memory_space; -typedef Kokkos::DefaultHostExecutionSpace host_space; +using exec_space_type = Kokkos::DefaultExecutionSpace; +using memory_space_type = exec_space_type::memory_space; +using host_space_type = Kokkos::DefaultHostExecutionSpace; -typedef double value_type; +using value_type = double; +using policy_type = Kokkos::TeamPolicy; +using member_type = typename policy_type::member_type; /// 128*128*128/16*5 * (2*8) / 16 /// @@ -56,10 +52,10 @@ typedef double value_type; using namespace KokkosBatched; static constexpr int vector_length = - DefaultVectorLength::value; + DefaultVectorLength::value; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) static constexpr int internal_vector_length = - DefaultInternalVectorLength::value; + DefaultInternalVectorLength::value; #else static constexpr int internal_vector_length = 1; #endif @@ -149,6 +145,83 @@ struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; #endif +template +struct SetTridiagToIdentity { + private: + VT __AA; + + public: + SetTridiagToIdentity(VT AA) : __AA(AA) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, __AA.extent(1)), [&](const int &j) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, __AA.extent(5)), + [&](const int &v) { + for (int k = 0, kend = __AA.extent(3); k < kend; ++k) + __AA(i, j, 1, k, k, v) = 1; + }); + }); + } +}; + +template +struct Factorize { + private: + VT __AA; + LT __L; + + public: + Factorize(VT AA, LT L) : __AA(AA), __L(L) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + typedef FactorizeModeAndAlgo + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + const int i = member.league_rank(); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, __AA.extent(5)), [&](const int &v) { + auto AAA = Kokkos::subview(__AA, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), v); + + /// subview patterns + auto A = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); + auto D = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + + if (__L == 1) { + A.assign_data(&AAA(0, 1, 0, 0)); + LU::invoke(member, A); + } else { + for (int k = 0; k < (__L - 1); ++k) { + A.assign_data(&AAA(k, 1, 0, 0)); + B.assign_data(&AAA(k, 2, 0, 0)); + C.assign_data(&AAA(k, 0, 0, 0)); + D.assign_data(&AAA(k + 1, 1, 0, 0)); + + LU::invoke(member, A); + Trsm::invoke(member, 1.0, A, B); + Trsm::invoke(member, 1.0, A, + C); + Gemm::invoke(member, -1.0, C, B, 1.0, D); + } + LU::invoke(member, D); + } + }); + } +}; + int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { @@ -189,53 +262,56 @@ int main(int argc, char *argv[]) { /// /// double 16 - Kokkos::View Av( + Kokkos::View Av( "A", N / vector_length, L, 3, Blk, Blk); /// double - Kokkos::View As( + Kokkos::View As( (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), vector_length); /// double 2 - Kokkos::View + Kokkos::View Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), vector_length / internal_vector_length); /// double 16 - Kokkos::View xv( + Kokkos::View xv( "x", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View xs( + Kokkos::View xs( (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), vector_length); /// double 2 - Kokkos::View + Kokkos::View xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), vector_length / internal_vector_length); /// double 16 - Kokkos::View bv( + Kokkos::View bv( "b", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View bs( + Kokkos::View bs( (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), bv.extent(3), vector_length); /// double 2 - Kokkos::View + Kokkos::View bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), bv.extent(3), vector_length / internal_vector_length); /// double copy of A - Kokkos::View Acopy( + Kokkos::View Acopy( "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), As.extent(4), As.extent(5)); - Kokkos::View rs( + Kokkos::View rs( "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3), bs.extent(4)); @@ -257,24 +333,9 @@ int main(int argc, char *argv[]) { cudaProfilerStart(); #endif timer.reset(); - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; policy_type policy(AA.extent(0), Kokkos::AUTO(), AA.extent(5)); - Kokkos::parallel_for( - "setTridiagToIdentity", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, AA.extent(1)), - [&](const int &j) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - for (int k = 0, kend = AA.extent(3); k < kend; ++k) - AA(i, j, 1, k, k, v) = 1; - }); - }); - }); + Kokkos::parallel_for("setTridiagToIdentity", policy, + SetTridiagToIdentity(AA)); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -286,7 +347,7 @@ int main(int argc, char *argv[]) { /// randomize input { const value_type one(1); - Kokkos::Random_XorShift64_Pool random(13245); + Kokkos::Random_XorShift64_Pool random(13245); Kokkos::fill_random(As, random, one); Kokkos::fill_random(bs, random, one); @@ -301,9 +362,7 @@ int main(int argc, char *argv[]) { cudaProfilerStart(); #endif timer.reset(); - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - int team_size = 0; + int team_size = 0; if (Blk < 8) { team_size = 32 / AA.extent(5); } else if (Blk < 12) { @@ -313,58 +372,9 @@ int main(int argc, char *argv[]) { } policy_type policy(AA.extent(0), team_size, AA.extent(5)); - Kokkos::parallel_for( - "factorize", policy.set_scratch_size(0, Kokkos::PerTeam(S)), - KOKKOS_LAMBDA(const member_type &member) { - typedef FactorizeModeAndAlgo - default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; - - const int i = member.league_rank(); - - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - auto AAA = - Kokkos::subview(AA, i, Kokkos::ALL(), Kokkos::ALL(), - Kokkos::ALL(), Kokkos::ALL(), v); - - /// subview patterns - auto A = - Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - auto B = - Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); - auto C = - Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - auto D = - Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - - if (L == 1) { - A.assign_data(&AAA(0, 1, 0, 0)); - LU::invoke(member, A); - } else { - for (int k = 0; k < (L - 1); ++k) { - A.assign_data(&AAA(k, 1, 0, 0)); - B.assign_data(&AAA(k, 2, 0, 0)); - C.assign_data(&AAA(k, 0, 0, 0)); - D.assign_data(&AAA(k + 1, 1, 0, 0)); - - LU::invoke(member, A); - Trsm::invoke(member, 1.0, A, B); - Trsm::invoke(member, 1.0, A, C); - Gemm::invoke(member, -1.0, C, B, - 1.0, D); - } - LU::invoke(member, D); - } - }); - }); + Kokkos::parallel_for("factorize", + policy.set_scratch_size(0, Kokkos::PerTeam(S)), + Factorize(AA, L)); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -382,9 +392,7 @@ int main(int argc, char *argv[]) { cudaProfilerStart(); #endif timer.reset(); - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - int team_size = 0; + int team_size = 0; if (Blk < 8) { team_size = 32 / AA.extent(5); } else if (Blk < 12) { @@ -527,8 +535,6 @@ int main(int argc, char *argv[]) { /// if (1) { typedef KokkosBatched::Algo::Level2::Unblocked algo_type; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; policy_type policy(Acopy.extent(0), Kokkos::AUTO(), Acopy.extent(5)); Kokkos::parallel_for( "compute residual", policy, KOKKOS_LAMBDA(const member_type &member) { @@ -678,7 +684,3 @@ int main(int argc, char *argv[]) { return 0; } - -#else -int main() { return 0; } -#endif From 49eb4ddbf11f622ea496f1f6a5e516d27ae658aa Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Mon, 21 Feb 2022 20:45:29 -0700 Subject: [PATCH 018/261] Add unit test for BsrMatrix and BlockCrsMatrix spmv --- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 4 +- src/sparse/KokkosSparse_spmv.hpp | 8 +- .../KokkosSparse_spmv_blockcrsmatrix_spec.hpp | 4 +- unit_test/sparse/Test_Sparse.hpp | 4 +- .../sparse/Test_Sparse_spmv_blockcrs.hpp | 388 ++++++++++++++---- unit_test/sparse/Test_Sparse_spmv_bsr.hpp | 359 +++++++++++++--- 6 files changed, 619 insertions(+), 148 deletions(-) diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index a1ae213ea9..a6eec44449 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -503,7 +503,7 @@ void spmv_block_impl_cusparse( default: { std::cerr << "Mode " << mode << " invalid for cusparse[*]bsrmv.\n"; throw std::invalid_argument("Invalid mode"); - } break; + } } #if (9000 <= CUDA_VERSION) @@ -599,7 +599,7 @@ void spm_mv_block_impl_cusparse( default: { std::cerr << "Mode " << mode << " invalid for cusparse[*]bsrmv.\n"; throw std::invalid_argument("Invalid mode"); - } break; + } } int colx = static_cast(x.extent(1)); diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index 8ec7799e16..52c9b4e0bf 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -1072,12 +1072,12 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], } // return Experimental::Impl::SPMV_MV_BLOCKCRSMATRIX< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, + typename AMatrix_Internal::const_value_type, + typename AMatrix_Internal::const_ordinal_type, typename AMatrix_Internal::device_type, typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type**, + typename AMatrix_Internal::const_size_type, + typename XVector_Internal::const_value_type**, typename XVector_Internal::array_layout, typename XVector_Internal::device_type, typename XVector_Internal::memory_traits, diff --git a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp index 7132ec0fe1..14b75f1c39 100644 --- a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp @@ -101,10 +101,10 @@ struct spmv_mv_blockcrsmatrix_eti_spec_avail { const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const *, LAYOUT_TYPE, \ + SCALAR_TYPE const **, LAYOUT_TYPE, \ Kokkos::Device, \ Kokkos::MemoryTraits, \ - SCALAR_TYPE *, LAYOUT_TYPE, \ + SCALAR_TYPE **, LAYOUT_TYPE, \ Kokkos::Device, \ Kokkos::MemoryTraits > { \ enum : bool { value = true }; \ diff --git a/unit_test/sparse/Test_Sparse.hpp b/unit_test/sparse/Test_Sparse.hpp index 2afa0fb2db..30639512c5 100644 --- a/unit_test/sparse/Test_Sparse.hpp +++ b/unit_test/sparse/Test_Sparse.hpp @@ -14,8 +14,8 @@ #include "Test_Sparse_spgemm.hpp" #include "Test_Sparse_spiluk.hpp" #include "Test_Sparse_spmv.hpp" -//#include "Test_Sparse_spmv_blockcrs.hpp" -//#include "Test_Sparse_spmv_bsr.hpp" +#include "Test_Sparse_spmv_blockcrs.hpp" +#include "Test_Sparse_spmv_bsr.hpp" #include "Test_Sparse_sptrsv.hpp" #include "Test_Sparse_trsv.hpp" diff --git a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp index b3bbe25718..c30923a5bf 100644 --- a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp @@ -42,6 +42,7 @@ //@HEADER */ +#include #include #include #include @@ -128,36 +129,44 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta, size_type nnz = static_cast(blockSize) * static_cast(blockSize) * mat_b1.nnz(); - // Fill block with random values - std::vector mat_val(nnz); - for (size_type ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]); - // // Create graph for CrsMatrix // - std::vector mat_rowmap(nRow + 1, 0); - std::vector mat_colidx(nnz, 0); + Kokkos::View d_rowmap("crsmatrix", nRow + 1); + auto h_rowmap = Kokkos::create_mirror_view(d_rowmap); + + Kokkos::View d_colidx("crsmatrix", nnz); + auto h_colidx = Kokkos::create_mirror_view(d_colidx); + + Kokkos::View d_matval("crsmatrix", nnz); + auto h_matval = Kokkos::create_mirror_view(d_matval); + + for (size_type ii = 0; ii < nnz; ++ii) set_random_value(h_matval[ii]); for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) { - const auto jbeg = mat_b1.graph.row_map(ir); - const auto jend = mat_b1.graph.row_map(ir + 1); + const size_type jbeg = mat_b1.graph.row_map(ir); + const size_type jend = mat_b1.graph.row_map(ir + 1); for (lno_t ib = 0; ib < blockSize; ++ib) { - const lno_t my_row = ir * blockSize + ib; - mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize; - for (lno_t ijk = jbeg; ijk < jend; ++ijk) { + const lno_t my_row = ir * blockSize + ib; + h_rowmap[my_row + 1] = h_rowmap[my_row] + (jend - jbeg) * blockSize; + for (size_type ijk = jbeg; ijk < jend; ++ijk) { const auto col0 = mat_b1.graph.entries(ijk); for (lno_t jb = 0; jb < blockSize; ++jb) { - mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = + h_colidx[h_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = col0 * blockSize + jb; } } } } // for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) + Kokkos::deep_copy(d_matval, h_matval); + Kokkos::deep_copy(d_colidx, h_colidx); + Kokkos::deep_copy(d_rowmap, h_rowmap); + // Create the CrsMatrix for the reference computation - crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], &mat_rowmap[0], - &mat_colidx[0]); + crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap, + d_colidx); x_vector_type xref("new_right_hand_side", nRow); auto h_xref = Kokkos::create_mirror_view(xref); @@ -179,7 +188,7 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // Compute the reference product KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); - y_vector_type ybcrs("bsr_product_result", nRow); + y_vector_type ybcrs("bcrs_product_result", nRow); auto h_ybcrs = Kokkos::create_mirror_view(ybcrs); for (lno_t ir = 0; ir < nRow; ++ir) h_ybcrs(ir) = h_y0(ir); Kokkos::deep_copy(ybcrs, h_ybcrs); @@ -187,26 +196,27 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // Create the BlockCrsMatrix KokkosSparse::Experimental::BlockCrsMatrix - Absr(Acrs, blockSize); + Abcrs(Acrs, blockSize); // Compute the product with the BlockCrsMatrix format - KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybcrs); + KokkosSparse::spmv(fOp, alpha, Abcrs, xref, beta, ybcrs); // Compare the two products - double error = 0.0, maxNorm = 0.0; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; + + const mag_type zero_mag = Kokkos::ArithTraits::zero(); + mag_type error = zero_mag, maxNorm = zero_mag; + Kokkos::deep_copy(h_ycrs, ycrs); Kokkos::deep_copy(h_ybcrs, ybcrs); for (lno_t ir = 0; ir < nRow; ++ir) { - error = std::max( - error, Kokkos::ArithTraits::abs(h_ycrs(ir) - h_ybcrs(ir))); - maxNorm = - std::max(maxNorm, Kokkos::ArithTraits::abs(h_ycrs(ir))); + error = std::max(error, KATS::abs(h_ycrs(ir) - h_ybcrs(ir))); + maxNorm = std::max(maxNorm, KATS::abs(h_ycrs(ir))); } - double tmps = - static_cast(Kokkos::ArithTraits::abs(alpha)) + - static_cast(Kokkos::ArithTraits::abs(beta)); - if ((tmps > 0.0) && (maxNorm == 0)) { + mag_type tmps = KATS::abs(alpha) + KATS::abs(beta); + if ((tmps > zero_mag) && (maxNorm == zero_mag)) { std::cout << " BlockCRSMatrix - SpMV times V >> blockSize " << blockSize << " maxNorm " << maxNorm << " error " << error << " alpha " << alpha << " beta " << beta << "\n"; @@ -216,9 +226,8 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // // --- Factor ((nnz / nRow) + 1) = Average number of non-zeros per row // - const auto tol = ((nnz / nRow) + 1) * - static_cast(Kokkos::ArithTraits::abs( - Kokkos::ArithTraits::epsilon())); + const mag_type tol = ((static_cast(nnz) / nRow) + 1) * + Kokkos::ArithTraits::epsilon(); if (error > tol * maxNorm) { std::cout << " BlockCRSMatrix - SpMV times V >> blockSize " << blockSize << " ratio " << error / maxNorm << " tol " << tol << " maxNorm " @@ -231,7 +240,7 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta, /// \brief Driver routine for checking BlockCrsMatrix times multiple vector template + typename layout, typename device> void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, const lno_t bMax, int &num_errors) { // The mat_structure view is used to generate a matrix using @@ -255,7 +264,7 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef Kokkos::View block_vector_t; + typedef Kokkos::View block_vector_t; h_crsMat_t mat_b1 = Test::generate_structured_matrix3D("FD", mat_structure); @@ -273,41 +282,40 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, size_type nnz = static_cast(blockSize) * static_cast(blockSize) * mat_b1.nnz(); - std::vector mat_val(nnz); - for (size_type ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]); + Kokkos::View d_rowmap("crsmatrix", nRow + 1); + auto h_rowmap = Kokkos::create_mirror_view(d_rowmap); - // - // Create graph for CrsMatrix - // + Kokkos::View d_colidx("crsmatrix", nnz); + auto h_colidx = Kokkos::create_mirror_view(d_colidx); - std::vector mat_rowmap(nRow + 1); - std::vector mat_colidx(nnz); + Kokkos::View d_matval("crsmatrix", nnz); + auto h_matval = Kokkos::create_mirror_view(d_matval); - mat_rowmap.resize(nRow + 1); - auto *rowmap = &mat_rowmap[0]; - rowmap[0] = 0; - - mat_colidx.resize(nnz); - auto *cols = &mat_colidx[0]; + for (size_type ii = 0; ii < nnz; ++ii) set_random_value(h_matval[ii]); for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) { - const auto jbeg = mat_b1.graph.row_map(ir); - const auto jend = mat_b1.graph.row_map(ir + 1); + const size_type jbeg = mat_b1.graph.row_map(ir); + const size_type jend = mat_b1.graph.row_map(ir + 1); for (lno_t ib = 0; ib < blockSize; ++ib) { - const lno_t my_row = ir * blockSize + ib; - rowmap[my_row + 1] = rowmap[my_row] + (jend - jbeg) * blockSize; - for (lno_t ijk = jbeg; ijk < jend; ++ijk) { + const lno_t my_row = ir * blockSize + ib; + h_rowmap[my_row + 1] = h_rowmap[my_row] + (jend - jbeg) * blockSize; + for (size_type ijk = jbeg; ijk < jend; ++ijk) { const auto col0 = mat_b1.graph.entries(ijk); for (lno_t jb = 0; jb < blockSize; ++jb) { - cols[rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = + h_colidx[h_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = col0 * blockSize + jb; } } } } // for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) + Kokkos::deep_copy(d_matval, h_matval); + Kokkos::deep_copy(d_colidx, h_colidx); + Kokkos::deep_copy(d_rowmap, h_rowmap); + // Create the CrsMatrix for the reference computation - crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], rowmap, cols); + crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap, + d_colidx); block_vector_t xref("new_right_hand_side", nRow, nrhs); auto h_xref = Kokkos::create_mirror_view(xref); @@ -329,7 +337,7 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); - block_vector_t ybcrs("bsr_product_result", nRow, nrhs); + block_vector_t ybcrs("bcrs_product_result", nRow, nrhs); auto h_ybcrs = Kokkos::create_mirror_view(ybcrs); for (int jc = 0; jc < nrhs; ++jc) for (lno_t ir = 0; ir < nRow; ++ir) h_ybcrs(ir, jc) = h_y0(ir, jc); @@ -338,38 +346,40 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, // Create the BlockCrsMatrix KokkosSparse::Experimental::BlockCrsMatrix - Absr(Acrs, blockSize); + Abcrs(Acrs, blockSize); // Compute the product for the BlockCrsMatrix format - KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybcrs); + KokkosSparse::spmv(fOp, alpha, Abcrs, xref, beta, ybcrs); Kokkos::deep_copy(h_ycrs, ycrs); Kokkos::deep_copy(h_ybcrs, ybcrs); // Compare the two products - double error = 0.0, maxNorm = 0.0; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; + + const mag_type zero_mag = Kokkos::ArithTraits::zero(); + mag_type error = zero_mag, maxNorm = zero_mag; + for (int jc = 0; jc < nrhs; ++jc) { for (int ir = 0; ir < nRow; ++ir) { - error = std::max(error, Kokkos::ArithTraits::abs( - h_ycrs(ir, jc) - h_ybcrs(ir, jc))); - maxNorm = std::max(maxNorm, - Kokkos::ArithTraits::abs(h_ycrs(ir, jc))); + error = std::max(error, + KATS::abs(h_ycrs(ir, jc) - h_ybcrs(ir, jc))); + maxNorm = std::max(maxNorm, KATS::abs(h_ycrs(ir, jc))); } } - auto tol = ((nnz / nRow) + 1) * - static_cast(Kokkos::ArithTraits::abs( - Kokkos::ArithTraits::epsilon())); - - double tmps = - static_cast(Kokkos::ArithTraits::abs(alpha)) + - static_cast(Kokkos::ArithTraits::abs(beta)); - if ((tmps > 0.0) && (maxNorm == 0)) { + + const mag_type tmps = KATS::abs(alpha) + KATS::abs(beta); + if ((tmps > zero_mag) && (maxNorm == zero_mag)) { std::cout << " BlockCRSMatrix - SpMV times MV >> blockSize " << blockSize << " maxNorm " << maxNorm << " error " << error << " alpha " << alpha << " beta " << beta << "\n"; num_errors += 1; } + const mag_type tol = ((static_cast(nnz) / nRow) + 1) * + Kokkos::ArithTraits::epsilon(); + if (error > tol * maxNorm) { std::cout << " BlockCRSMatrix - SpMV times MV >> blockSize " << blockSize << " ratio " << error / maxNorm << " tol " << tol << " maxNorm " @@ -425,7 +435,7 @@ void testSpMVBlockCrsMatrix() { } template + typename layout, typename device> void testBlockCrsMatrix_SpM_MV() { // // Test for the operation Y <- alpha * Op(A) * X + beta * Y @@ -452,7 +462,7 @@ void testBlockCrsMatrix_SpM_MV() { auto alpha_s = static_cast(testAlphaBeta[ii]); auto beta_s = static_cast(testAlphaBeta[ii + 1]); num_errors = 0; - Test_BlockCrs::check_blockcrs_times_mv(&mode, alpha_s, beta_s, bMax, num_errors); if (num_errors > 0) { @@ -482,13 +492,237 @@ void testBlockCrsMatrix_SpM_MV() { ////////////////////////// -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - sparse##_##bcrs_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - testBlockCrsMatrix_SpM_MV(); \ +#define EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##bcrs_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ + testBlockCrsMatrix_SpM_MV(); \ } -#include - -#undef KOKKOSKERNELS_EXECUTE_TEST +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, int, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, int, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, int, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, int, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, size_t, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, size_t, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, size_t, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, size_t, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, int, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, int, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, int, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, int, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, size_t, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, size_t, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, size_t, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, size_t, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, int, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, int, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, + LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, int, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, int, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, LayoutRight, + TestExecSpace) +#endif +#endif + +#undef EXECUTE_BCRS_TIMES_MVEC_TEST diff --git a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp index b8cd411154..25b44b4e7e 100644 --- a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp @@ -42,6 +42,7 @@ //@HEADER */ +#include #include #include #include @@ -96,33 +97,29 @@ inline void set_random_value(std::complex &v) { /// \param mat_rowmap[out] CRS-style row map for the block matrix /// \param mat_colidx[out] CRS-style column entries for the block matrix /// \param mat_val[out] Numerical (random) values -template +template void make_block_entries( const KokkosSparse::CrsMatrix &mat_b1, - int blockSize, std::vector &mat_rowmap, - std::vector &mat_colidx, std::vector &mat_val) { - lno_t nRow = blockSize * mat_b1.numRows(); + int blockSize, rowmap_type &mat_rowmap, colidx_type &mat_colidx, + values_type &mat_val) { size_t nnz = static_cast(blockSize) * static_cast(blockSize) * mat_b1.nnz(); - mat_val.resize(nnz); for (size_t ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]); // // Create graph for CrsMatrix // - mat_rowmap.assign(nRow + 1, 0); - mat_colidx.assign(nnz, 0); - for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) { - const auto jbeg = mat_b1.graph.row_map(ir); - const auto jend = mat_b1.graph.row_map(ir + 1); + const size_type jbeg = mat_b1.graph.row_map(ir); + const size_type jend = mat_b1.graph.row_map(ir + 1); for (lno_t ib = 0; ib < blockSize; ++ib) { const lno_t my_row = ir * blockSize + ib; mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize; - for (auto ijk = jbeg; ijk < jend; ++ijk) { + for (size_type ijk = jbeg; ijk < jend; ++ijk) { const auto col0 = mat_b1.graph.entries(ijk); for (lno_t jb = 0; jb < blockSize; ++jb) { mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = @@ -177,17 +174,26 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, size_type nnz = static_cast(blockSize) * static_cast(blockSize) * mat_b1.nnz(); - std::vector mat_rowmap(nRow + 1, 0); - std::vector mat_colidx(nnz, 0); - std::vector mat_val(nnz); + Kokkos::View d_rowmap("crsmatrix", nRow + 1); + auto h_rowmap = Kokkos::create_mirror_view(d_rowmap); + + Kokkos::View d_colidx("crsmatrix", nnz); + auto h_colidx = Kokkos::create_mirror_view(d_colidx); + + Kokkos::View d_matval("crsmatrix", nnz); + auto h_matval = Kokkos::create_mirror_view(d_matval); // Create the entries - make_block_entries(mat_b1, blockSize, mat_rowmap, - mat_colidx, mat_val); + make_block_entries(mat_b1, blockSize, h_rowmap, + h_colidx, h_matval); + + Kokkos::deep_copy(d_matval, h_matval); + Kokkos::deep_copy(d_colidx, h_colidx); + Kokkos::deep_copy(d_rowmap, h_rowmap); // Create the CrsMatrix for the reference computation - crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], &mat_rowmap[0], - &mat_colidx[0]); + crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap, + d_colidx); x_vector_type xref("new_right_hand_side", nRow); auto h_xref = Kokkos::create_mirror_view(xref); @@ -229,20 +235,21 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // // Compare the two products // - double error = 0.0, maxNorm = 0.0; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; + + const mag_type zero_mag = Kokkos::ArithTraits::zero(); + mag_type error = zero_mag, maxNorm = zero_mag; + Kokkos::deep_copy(h_ycrs, ycrs); Kokkos::deep_copy(h_ybsr, ybsr); for (lno_t ir = 0; ir < nRow; ++ir) { - error = std::max( - error, Kokkos::ArithTraits::abs(h_ycrs(ir) - h_ybsr(ir))); - maxNorm = - std::max(maxNorm, Kokkos::ArithTraits::abs(h_ycrs(ir))); + error = std::max(error, KATS::abs(h_ycrs(ir) - h_ybsr(ir))); + maxNorm = std::max(maxNorm, KATS::abs(h_ycrs(ir))); } - double tmps = - static_cast(Kokkos::ArithTraits::abs(alpha)) + - static_cast(Kokkos::ArithTraits::abs(beta)); - if ((tmps > 0.0) && (maxNorm == 0)) { + mag_type tmps = KATS::abs(alpha) + KATS::abs(beta); + if ((tmps > zero_mag) && (maxNorm == zero_mag)) { std::cout << " BSR - SpMV times MV >> blockSize " << blockSize << " maxNorm " << maxNorm << " error " << error << " alpha " << alpha << " beta " << beta << "\n"; @@ -252,9 +259,8 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // // --- Factor ((nnz / nRow) + 1) = Average number of non-zeros per row // - const auto tol = ((nnz / nRow) + 1) * - static_cast(Kokkos::ArithTraits::abs( - Kokkos::ArithTraits::epsilon())); + const mag_type tol = ((static_cast(nnz) / nRow) + 1) * + Kokkos::ArithTraits::epsilon(); if (error > tol * maxNorm) { std::cout << " BSR - SpMV times V >> blockSize " << blockSize << " ratio " << error / maxNorm << " tol " << tol << " maxNorm " << maxNorm @@ -267,7 +273,7 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, /// \brief Driver routine for checking BsrMatrix times multiple vector template + typename layout, typename device> void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, const lno_t bMax, int &num_errors) { // The mat_structure view is used to generate a matrix using @@ -291,7 +297,7 @@ void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef Kokkos::View block_vector_t; + typedef Kokkos::View block_vector_t; h_crsMat_t mat_b1 = Test::generate_structured_matrix3D("FD", mat_structure); @@ -309,17 +315,26 @@ void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, size_type nnz = static_cast(blockSize) * static_cast(blockSize) * mat_b1.nnz(); - std::vector mat_rowmap(nRow + 1, 0); - std::vector mat_colidx(nnz, 0); - std::vector mat_val(nnz); + Kokkos::View d_rowmap("crsmatrix", nRow + 1); + auto h_rowmap = Kokkos::create_mirror_view(d_rowmap); + + Kokkos::View d_colidx("crsmatrix", nnz); + auto h_colidx = Kokkos::create_mirror_view(d_colidx); + + Kokkos::View d_matval("crsmatrix", nnz); + auto h_matval = Kokkos::create_mirror_view(d_matval); // Create the entries - make_block_entries(mat_b1, static_cast(blockSize), - mat_rowmap, mat_colidx, mat_val); + make_block_entries(mat_b1, blockSize, h_rowmap, + h_colidx, h_matval); + + Kokkos::deep_copy(d_matval, h_matval); + Kokkos::deep_copy(d_colidx, h_colidx); + Kokkos::deep_copy(d_rowmap, h_rowmap); // Create the CrsMatrix for the reference computation - crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], &mat_rowmap[0], - &mat_colidx[0]); + crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap, + d_colidx); block_vector_t xref("new_right_hand_side", nRow, nrhs); auto h_xref = Kokkos::create_mirror_view(xref); @@ -366,29 +381,29 @@ void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, // // Compare the two products // - double error = 0.0, maxNorm = 0.0; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; + + const mag_type zero_mag = Kokkos::ArithTraits::zero(); + mag_type error = zero_mag, maxNorm = zero_mag; for (int jc = 0; jc < nrhs; ++jc) { for (int ir = 0; ir < nRow; ++ir) { - error = std::max(error, Kokkos::ArithTraits::abs( - h_ycrs(ir, jc) - h_ybsr(ir, jc))); - maxNorm = std::max(maxNorm, - Kokkos::ArithTraits::abs(h_ycrs(ir, jc))); + error = std::max(error, + KATS::abs(h_ycrs(ir, jc) - h_ybsr(ir, jc))); + maxNorm = std::max(maxNorm, KATS::abs(h_ycrs(ir, jc))); } } - double tmps = - static_cast(Kokkos::ArithTraits::abs(alpha)) + - static_cast(Kokkos::ArithTraits::abs(beta)); - if ((tmps > 0.0) && (maxNorm == 0)) { + mag_type tmps = KATS::abs(alpha) + KATS::abs(beta); + if ((tmps > zero_mag) && (maxNorm == zero_mag)) { std::cout << " BSR - SpMV times MV >> blockSize " << blockSize << " maxNorm " << maxNorm << " error " << error << " alpha " << alpha << " beta " << beta << "\n"; num_errors += 1; } - auto tol = ((nnz / nRow) + 1) * - static_cast(Kokkos::ArithTraits::abs( - Kokkos::ArithTraits::epsilon())); + const mag_type tol = ((static_cast(nnz) / nRow) + 1) * + Kokkos::ArithTraits::epsilon(); if (error > tol * maxNorm) { std::cout << " BSR - SpMV times MV >> blockSize " << blockSize << " ratio " << error / maxNorm << " tol " << tol << " maxNorm " @@ -531,7 +546,7 @@ void testSpMVBsrMatrix() { } template + typename layout, typename device> void testBsrMatrix_SpM_MV() { // // Test for the operation Y <- alpha * Op(A) * X + beta * Y @@ -558,7 +573,7 @@ void testBsrMatrix_SpM_MV() { auto alpha_s = static_cast(testAlphaBeta[ii]); auto beta_s = static_cast(testAlphaBeta[ii + 1]); num_errors = 0; - Test_Bsr::check_bsrm_times_mv( + Test_Bsr::check_bsrm_times_mv( &mode, alpha_s, beta_s, bMax, num_errors); if (num_errors > 0) { printf( @@ -587,13 +602,235 @@ void testBsrMatrix_SpM_MV() { ////////////////////////// -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - sparse##_##bsrmat_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - testBsrMatrix_SpM_MV(); \ +#define EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##bsrmat_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ + testBsrMatrix_SpM_MV(); \ } -#include - -#undef KOKKOSKERNELS_EXECUTE_TEST +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(double, int, int, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(double, int, int, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, int, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, int, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(double, int, size_t, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(double, int, size_t, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, size_t, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, size_t, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(float, int, int, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(float, int, int, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, int, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, int, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(float, int, size_t, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(float, int, size_t, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, size_t, LayoutLeft, TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, size_t, LayoutRight, TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, int, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, int, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, int, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, int, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, LayoutRight, + TestExecSpace) +#endif +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, LayoutLeft, + TestExecSpace) +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, LayoutRight, + TestExecSpace) +#endif +#endif + +#undef EXECUTE_BSR_TIMES_MVEC_TEST From eef432e0541bb22954f842aeb791ee76b1779437 Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Thu, 24 Feb 2022 20:40:14 -0700 Subject: [PATCH 019/261] Add barrier --- src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 69a95f6f9e..cc8551638f 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -969,6 +969,8 @@ struct BSR_GEMV_Transpose_Functor { Kokkos::atomic_add(&Y_cur(ijk), shared_view(ijk)); }); + // + dev.team_barrier(); } } else { for (ordinal_type jBlock = 0; jBlock < count; ++jBlock) { @@ -998,6 +1000,8 @@ struct BSR_GEMV_Transpose_Functor { [&](const ordinal_type &ijk) { Kokkos::atomic_add(&Y_cur(ijk), shared_y[ijk]); }); + // + dev.team_barrier(); } } } From 5c419f19a7cfac19855964da8d8518247200d7fb Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 28 Feb 2022 14:23:29 -0700 Subject: [PATCH 020/261] perf_test/blas: Check ARMPL build version --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index aa78e0bf97..b9cff5e5e4 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1883,7 +1883,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, // Check the result if (gemm_args.C.data() != nullptr) { -#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 if (options.test == EXPERIMENT) { using view_type_2d = Kokkos::View; @@ -1908,7 +1908,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, } } } -#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL +#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL && ARMPL_BUILD >= 1058 if (__gemm_do_compare(C_expected, gemm_args.C)) FATAL_ERROR("Result value mismatch!"); } @@ -2078,7 +2078,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { Kokkos::fence(); } -#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 if (options.test == EXPERIMENT) { armpl_int_t bstrd_A, istrd_A, jstrd_A, bstrd_B, istrd_B, jstrd_B, bstrd_C, istrd_C, jstrd_C; @@ -2168,7 +2168,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.B_pl.mat = B_p; gemm_args.C_pl.mat = C_p; } -#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL +#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL && ARMPL_BUILD >= 1058 gemm_args.alpha = options.blas_args.gemm.alpha; gemm_args.beta = options.blas_args.gemm.beta; From ef9f08b5029008bbef46ebf3b2473f5311598697 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 1 Mar 2022 13:37:25 -0700 Subject: [PATCH 021/261] Restore BLAS-1 MV paths for 1 column Also: test these paths, test nrm2w, and use 3-arg (async) deep copies in the >1 column paths of these kernels. --- src/blas/impl/KokkosBlas1_dot_mv_impl.hpp | 5 +- src/blas/impl/KokkosBlas1_dot_spec.hpp | 45 +++- src/blas/impl/KokkosBlas1_nrm1_impl.hpp | 5 +- src/blas/impl/KokkosBlas1_nrm1_spec.hpp | 21 +- src/blas/impl/KokkosBlas1_nrm2_impl.hpp | 5 +- src/blas/impl/KokkosBlas1_nrm2_spec.hpp | 22 +- src/blas/impl/KokkosBlas1_nrm2w_impl.hpp | 5 +- src/blas/impl/KokkosBlas1_nrm2w_spec.hpp | 23 +- src/blas/impl/KokkosBlas1_sum_impl.hpp | 5 +- src/blas/impl/KokkosBlas1_sum_spec.hpp | 21 +- unit_test/blas/Test_Blas.hpp | 1 + unit_test/blas/Test_Blas1_dot.hpp | 3 + unit_test/blas/Test_Blas1_nrm1.hpp | 3 + unit_test/blas/Test_Blas1_nrm2.hpp | 3 + unit_test/blas/Test_Blas1_nrm2_squared.hpp | 3 + unit_test/blas/Test_Blas1_nrm2w.hpp | 234 +++++++++++++++++++++ unit_test/blas/Test_Blas1_sum.hpp | 3 + 17 files changed, 370 insertions(+), 37 deletions(-) create mode 100644 unit_test/blas/Test_Blas1_nrm2w.hpp diff --git a/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp b/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp index 500dc035ca..dfbae10a99 100644 --- a/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp +++ b/src/blas/impl/KokkosBlas1_dot_mv_impl.hpp @@ -131,7 +131,8 @@ void MV_Dot_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerDot; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -156,7 +157,7 @@ void MV_Dot_Invoke( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Dot_MV temp result"), r.extent(0)); MV_Dot_Invoke(tempResult, x, y); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_dot_spec.hpp b/src/blas/impl/KokkosBlas1_dot_spec.hpp index 350934230d..33c7603057 100644 --- a/src/blas/impl/KokkosBlas1_dot_spec.hpp +++ b/src/blas/impl/KokkosBlas1_dot_spec.hpp @@ -377,6 +377,20 @@ struct Dot + static auto getFirstColumn( + const V& v, typename std::enable_if::type* = nullptr) { + return Kokkos::subview(v, Kokkos::ALL(), 0); + } + + template + static V getFirstColumn( + const V& v, typename std::enable_if::type* = nullptr) { + return v; + } + static void dot(const RV& R, const XV& X, const YV& Y) { Kokkos::Profiling::pushRegion(KOKKOSKERNELS_IMPL_COMPILE_LIBRARY ? "KokkosBlas::dot[ETI]" @@ -392,14 +406,31 @@ struct Dot(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - typedef int index_type; - MV_Dot_Invoke(R, X, Y); + const size_type numDots = std::max(X.extent(1), Y.extent(1)); + if (numDots == Kokkos::ArithTraits::one()) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = getFirstColumn(X); + auto Y0 = getFirstColumn(Y); + if (numRows < static_cast(INT_MAX)) { + typedef int index_type; + DotFunctor f(X0, + Y0); + f.run("KokkosBlas::dot<1D>", R0); + } else { + typedef int64_t index_type; + DotFunctor f(X0, + Y0); + f.run("KokkosBlas::dot<1D>", R0); + } } else { - typedef std::int64_t index_type; - MV_Dot_Invoke(R, X, Y); + if (numRows < static_cast(INT_MAX) && + numRows * numDots < static_cast(INT_MAX)) { + typedef int index_type; + MV_Dot_Invoke(R, X, Y); + } else { + typedef std::int64_t index_type; + MV_Dot_Invoke(R, X, Y); + } } Kokkos::Profiling::popRegion(); } diff --git a/src/blas/impl/KokkosBlas1_nrm1_impl.hpp b/src/blas/impl/KokkosBlas1_nrm1_impl.hpp index 07422035b7..2002ef2c39 100644 --- a/src/blas/impl/KokkosBlas1_nrm1_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm1_impl.hpp @@ -170,7 +170,8 @@ void MV_Nrm1_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -195,7 +196,7 @@ void MV_Nrm1_Invoke( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm1 temp result"), r.extent(0)); MV_Nrm1_Invoke(tempResult, x); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_nrm1_spec.hpp b/src/blas/impl/KokkosBlas1_nrm1_spec.hpp index df86d00fa2..478395d7a9 100644 --- a/src/blas/impl/KokkosBlas1_nrm1_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm1_spec.hpp @@ -200,12 +200,23 @@ struct Nrm1 { : "KokkosBlas::nrm1[noETI]"); const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm1_Invoke(R, X); + if (numCols == Kokkos::ArithTraits::one()) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); + if (numRows < static_cast(INT_MAX)) { + V_Nrm1_Invoke(R0, X0); + } else { + typedef std::int64_t index_type; + V_Nrm1_Invoke(R0, X0); + } } else { - typedef std::int64_t index_type; - MV_Nrm1_Invoke(R, X); + if (numRows < static_cast(INT_MAX) && + numRows * numCols < static_cast(INT_MAX)) { + MV_Nrm1_Invoke(R, X); + } else { + typedef std::int64_t index_type; + MV_Nrm1_Invoke(R, X); + } } Kokkos::Profiling::popRegion(); } diff --git a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp index 4efc0e6c6d..f2b0e826bc 100644 --- a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp @@ -200,7 +200,8 @@ void MV_Nrm2_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -230,7 +231,7 @@ void MV_Nrm2_Invoke( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Nrm2 temp result"), r.extent(0)); MV_Nrm2_Invoke(tempResult, x, take_sqrt); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_nrm2_spec.hpp b/src/blas/impl/KokkosBlas1_nrm2_spec.hpp index 340d78fdf1..71afb2ede3 100644 --- a/src/blas/impl/KokkosBlas1_nrm2_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2_spec.hpp @@ -200,12 +200,24 @@ struct Nrm2 { const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm2_Invoke(R, X, take_sqrt); + if (numCols == Kokkos::ArithTraits::one()) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); + if (numRows < static_cast(INT_MAX)) { + V_Nrm2_Invoke(R0, X0, take_sqrt); + } else { + typedef std::int64_t index_type; + V_Nrm2_Invoke(R0, X0, + take_sqrt); + } } else { - typedef std::int64_t index_type; - MV_Nrm2_Invoke(R, X, take_sqrt); + if (numRows < static_cast(INT_MAX) && + numRows * numCols < static_cast(INT_MAX)) { + MV_Nrm2_Invoke(R, X, take_sqrt); + } else { + typedef std::int64_t index_type; + MV_Nrm2_Invoke(R, X, take_sqrt); + } } Kokkos::Profiling::popRegion(); } diff --git a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp index 3013fd17f8..3f202ca430 100644 --- a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp @@ -199,7 +199,8 @@ void MV_Nrm2w_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -230,7 +231,7 @@ void MV_Nrm2w_Invoke( r.extent(0)); MV_Nrm2w_Invoke(tempResult, x, w, take_sqrt); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp b/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp index fe437bbc5c..28162bce5f 100644 --- a/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2w_spec.hpp @@ -201,12 +201,25 @@ struct Nrm2w { const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Nrm2w_Invoke(R, X, W, take_sqrt); + if (numCols == 1) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); + auto W0 = Kokkos::subview(W, Kokkos::ALL(), 0); + if (numRows < static_cast(INT_MAX)) { + V_Nrm2w_Invoke(R0, X0, W0, take_sqrt); + } else { + typedef std::int64_t index_type; + V_Nrm2w_Invoke(R0, X0, W0, + take_sqrt); + } } else { - typedef std::int64_t index_type; - MV_Nrm2w_Invoke(R, X, W, take_sqrt); + if (numRows < static_cast(INT_MAX) && + numRows * numCols < static_cast(INT_MAX)) { + MV_Nrm2w_Invoke(R, X, W, take_sqrt); + } else { + typedef std::int64_t index_type; + MV_Nrm2w_Invoke(R, X, W, take_sqrt); + } } Kokkos::Profiling::popRegion(); } diff --git a/src/blas/impl/KokkosBlas1_sum_impl.hpp b/src/blas/impl/KokkosBlas1_sum_impl.hpp index 05cede0f0d..b87f2e1092 100644 --- a/src/blas/impl/KokkosBlas1_sum_impl.hpp +++ b/src/blas/impl/KokkosBlas1_sum_impl.hpp @@ -162,7 +162,8 @@ void MV_Sum_Invoke( } // Zero out the result vector Kokkos::deep_copy( - r, Kokkos::ArithTraits::zero()); + execution_space(), r, + Kokkos::ArithTraits::zero()); size_type teamsPerVec; KokkosBlas::Impl::multipleReductionWorkDistribution( @@ -187,7 +188,7 @@ void MV_Sum_Invoke( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Sum temp result"), r.extent(0)); MV_Sum_Invoke(tempResult, x); - Kokkos::deep_copy(r, tempResult); + Kokkos::deep_copy(typename XV::execution_space(), r, tempResult); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_sum_spec.hpp b/src/blas/impl/KokkosBlas1_sum_spec.hpp index 505296cab9..09c34299c7 100644 --- a/src/blas/impl/KokkosBlas1_sum_spec.hpp +++ b/src/blas/impl/KokkosBlas1_sum_spec.hpp @@ -197,12 +197,23 @@ struct Sum { const size_type numRows = X.extent(0); const size_type numCols = X.extent(1); - if (numRows < static_cast(INT_MAX) && - numRows * numCols < static_cast(INT_MAX)) { - MV_Sum_Invoke(R, X); + if (numCols == Kokkos::ArithTraits::one()) { + auto R0 = Kokkos::subview(R, 0); + auto X0 = Kokkos::subview(X, Kokkos::ALL(), 0); + if (numRows < static_cast(INT_MAX)) { + V_Sum_Invoke(R0, X0); + } else { + typedef std::int64_t index_type; + V_Sum_Invoke(R0, X0); + } } else { - typedef std::int64_t index_type; - MV_Sum_Invoke(R, X); + if (numRows < static_cast(INT_MAX) && + numRows * numCols < static_cast(INT_MAX)) { + MV_Sum_Invoke(R, X); + } else { + typedef std::int64_t index_type; + MV_Sum_Invoke(R, X); + } } Kokkos::Profiling::popRegion(); } diff --git a/unit_test/blas/Test_Blas.hpp b/unit_test/blas/Test_Blas.hpp index 642a0bf5f0..5244c35e53 100644 --- a/unit_test/blas/Test_Blas.hpp +++ b/unit_test/blas/Test_Blas.hpp @@ -15,6 +15,7 @@ #include "Test_Blas1_nrm1.hpp" #include "Test_Blas1_nrm2_squared.hpp" #include "Test_Blas1_nrm2.hpp" +#include "Test_Blas1_nrm2w.hpp" #include "Test_Blas1_nrminf.hpp" #include "Test_Blas1_reciprocal.hpp" #include "Test_Blas1_scal.hpp" diff --git a/unit_test/blas/Test_Blas1_dot.hpp b/unit_test/blas/Test_Blas1_dot.hpp index 920ac06c77..536e58486c 100644 --- a/unit_test/blas/Test_Blas1_dot.hpp +++ b/unit_test/blas/Test_Blas1_dot.hpp @@ -196,6 +196,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(789, 1); // Test::impl_test_dot_mv(132231,5); #endif @@ -207,6 +208,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(789, 1); // Test::impl_test_dot_mv(132231,5); #endif @@ -218,6 +220,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); + Test::impl_test_dot_mv(789, 1); // Test::impl_test_dot_mv(132231,5); #endif diff --git a/unit_test/blas/Test_Blas1_nrm1.hpp b/unit_test/blas/Test_Blas1_nrm1.hpp index 72861bf5a3..c68492b6dd 100644 --- a/unit_test/blas/Test_Blas1_nrm1.hpp +++ b/unit_test/blas/Test_Blas1_nrm1.hpp @@ -149,6 +149,7 @@ int test_nrm1_mv() { Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); Test::impl_test_nrm1_mv(1024, 5); + Test::impl_test_nrm1_mv(789, 1); Test::impl_test_nrm1_mv(132231, 5); #endif @@ -159,6 +160,7 @@ int test_nrm1_mv() { Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); Test::impl_test_nrm1_mv(1024, 5); + Test::impl_test_nrm1_mv(789, 1); Test::impl_test_nrm1_mv(132231, 5); #endif @@ -169,6 +171,7 @@ int test_nrm1_mv() { Test::impl_test_nrm1_mv(0, 5); Test::impl_test_nrm1_mv(13, 5); Test::impl_test_nrm1_mv(1024, 5); + Test::impl_test_nrm1_mv(789, 1); Test::impl_test_nrm1_mv(132231, 5); #endif diff --git a/unit_test/blas/Test_Blas1_nrm2.hpp b/unit_test/blas/Test_Blas1_nrm2.hpp index 94d5414e15..688035f842 100644 --- a/unit_test/blas/Test_Blas1_nrm2.hpp +++ b/unit_test/blas/Test_Blas1_nrm2.hpp @@ -144,6 +144,7 @@ int test_nrm2_mv() { Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); Test::impl_test_nrm2_mv(1024, 5); + Test::impl_test_nrm2_mv(789, 1); // Test::impl_test_nrm2_mv(132231,5); #endif @@ -154,6 +155,7 @@ int test_nrm2_mv() { Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); Test::impl_test_nrm2_mv(1024, 5); + Test::impl_test_nrm2_mv(789, 1); // Test::impl_test_nrm2_mv(132231,5); #endif @@ -164,6 +166,7 @@ int test_nrm2_mv() { Test::impl_test_nrm2_mv(0, 5); Test::impl_test_nrm2_mv(13, 5); Test::impl_test_nrm2_mv(1024, 5); + Test::impl_test_nrm2_mv(789, 1); // Test::impl_test_nrm2_mv(132231,5); #endif diff --git a/unit_test/blas/Test_Blas1_nrm2_squared.hpp b/unit_test/blas/Test_Blas1_nrm2_squared.hpp index ca357acdb2..317b9b543b 100644 --- a/unit_test/blas/Test_Blas1_nrm2_squared.hpp +++ b/unit_test/blas/Test_Blas1_nrm2_squared.hpp @@ -160,6 +160,7 @@ int test_nrm2_squared_mv() { Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); Test::impl_test_nrm2_squared_mv(1024, 5); + Test::impl_test_nrm2_squared_mv(789, 1); // Test::impl_test_nrm2_squared_mv(132231,5); #endif @@ -170,6 +171,7 @@ int test_nrm2_squared_mv() { Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); Test::impl_test_nrm2_squared_mv(1024, 5); + Test::impl_test_nrm2_squared_mv(789, 1); // Test::impl_test_nrm2_squared_mv(132231,5); #endif @@ -180,6 +182,7 @@ int test_nrm2_squared_mv() { Test::impl_test_nrm2_squared_mv(0, 5); Test::impl_test_nrm2_squared_mv(13, 5); Test::impl_test_nrm2_squared_mv(1024, 5); + Test::impl_test_nrm2_squared_mv(789, 1); // Test::impl_test_nrm2_squared_mv(132231,5); #endif diff --git a/unit_test/blas/Test_Blas1_nrm2w.hpp b/unit_test/blas/Test_Blas1_nrm2w.hpp new file mode 100644 index 0000000000..cda59c83e4 --- /dev/null +++ b/unit_test/blas/Test_Blas1_nrm2w.hpp @@ -0,0 +1,234 @@ +#include +#include +#include +#include +#include + +namespace Test { +template +void impl_test_nrm2w(int N) { + typedef typename ViewTypeA::value_type ScalarA; + typedef Kokkos::ArithTraits AT; + + ViewTypeA a("A", N); + ViewTypeA w("W", N); + + typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(w, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(h_a, a); + Kokkos::deep_copy(h_w, w); + + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + + typename AT::mag_type expected_result = 0; + for (int i = 0; i < N; i++) { + typename AT::mag_type term = AT::abs(h_a(i)) / AT::abs(h_w(i)); + expected_result += term * term; + } + expected_result = + Kokkos::ArithTraits::sqrt(expected_result); + + typename AT::mag_type nonconst_result = KokkosBlas::nrm2w(a, w); + EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); +} + +template +void impl_test_nrm2w_mv(int N, int K) { + typedef typename ViewTypeA::value_type ScalarA; + typedef Kokkos::ArithTraits AT; + + typedef multivector_layout_adapter vfA_type; + + typename vfA_type::BaseType b_a("A", N, K); + typename vfA_type::BaseType b_w("W", N, K); + + ViewTypeA a = vfA_type::view(b_a); + ViewTypeA w = vfA_type::view(b_w); + + typedef multivector_layout_adapter h_vfA_type; + + typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); + typename h_vfA_type::BaseType h_b_w = Kokkos::create_mirror_view(b_w); + + typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(b_w, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(h_b_a, b_a); + Kokkos::deep_copy(h_b_w, b_w); + + typename AT::mag_type* expected_result = new typename AT::mag_type[K]; + for (int j = 0; j < K; j++) { + expected_result[j] = typename AT::mag_type(); + for (int i = 0; i < N; i++) { + typename AT::mag_type term = AT::abs(h_a(i, j)) / AT::abs(h_w(i, j)); + expected_result[j] += term * term; + } + expected_result[j] = + Kokkos::ArithTraits::sqrt(expected_result[j]); + } + + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + + Kokkos::View r("Dot::Result", K); + KokkosBlas::nrm2w(r, a, w); + auto r_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); + + for (int k = 0; k < K; k++) { + typename AT::mag_type nonconst_result = r_host(k); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], + eps * expected_result[k]); + } + + delete[] expected_result; +} +} // namespace Test + +template +int test_nrm2w() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ll; + Test::impl_test_nrm2w(0); + Test::impl_test_nrm2w(13); + Test::impl_test_nrm2w(1024); + // Test::impl_test_nrm2(132231); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_lr; + Test::impl_test_nrm2w(0); + Test::impl_test_nrm2w(13); + Test::impl_test_nrm2w(1024); + // Test::impl_test_nrm2(132231); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w(0); + Test::impl_test_nrm2w(13); + Test::impl_test_nrm2w(1024); + // Test::impl_test_nrm2(132231); +#endif + + return 1; +} + +template +int test_nrm2w_mv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ll; + Test::impl_test_nrm2w_mv(0, 5); + Test::impl_test_nrm2w_mv(13, 5); + Test::impl_test_nrm2w_mv(1024, 5); + Test::impl_test_nrm2w_mv(789, 1); + // Test::impl_test_nrm2w_mv(132231,5); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_lr; + Test::impl_test_nrm2w_mv(0, 5); + Test::impl_test_nrm2w_mv(13, 5); + Test::impl_test_nrm2w_mv(1024, 5); + Test::impl_test_nrm2w_mv(789, 1); + // Test::impl_test_nrm2w_mv(132231,5); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_mv(0, 5); + Test::impl_test_nrm2w_mv(13, 5); + Test::impl_test_nrm2w_mv(1024, 5); + Test::impl_test_nrm2w_mv(789, 1); + // Test::impl_test_nrm2w_mv(132231,5); +#endif + + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_float"); + test_nrm2w(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_mv_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_float"); + test_nrm2w_mv(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_double"); + test_nrm2w(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_mv_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_double"); + test_nrm2w_mv(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_complex_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_complex_double"); + test_nrm2w, TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_mv_complex_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_complex_double"); + test_nrm2w_mv, TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_int"); + test_nrm2w(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_mv_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_mv_int"); + test_nrm2w_mv(); + Kokkos::Profiling::popRegion(); +} +#endif diff --git a/unit_test/blas/Test_Blas1_sum.hpp b/unit_test/blas/Test_Blas1_sum.hpp index 768091885c..2b7f51370e 100644 --- a/unit_test/blas/Test_Blas1_sum.hpp +++ b/unit_test/blas/Test_Blas1_sum.hpp @@ -133,6 +133,7 @@ int test_sum_mv() { Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); Test::impl_test_sum_mv(1024, 5); + Test::impl_test_sum_mv(789, 1); // Test::impl_test_sum_mv(132231,5); #endif @@ -143,6 +144,7 @@ int test_sum_mv() { Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); Test::impl_test_sum_mv(1024, 5); + Test::impl_test_sum_mv(789, 1); // Test::impl_test_sum_mv(132231,5); #endif @@ -153,6 +155,7 @@ int test_sum_mv() { Test::impl_test_sum_mv(0, 5); Test::impl_test_sum_mv(13, 5); Test::impl_test_sum_mv(1024, 5); + Test::impl_test_sum_mv(789, 1); // Test::impl_test_sum_mv(132231,5); #endif From 89111309f691fdd7783c283ca8ac5dbaa1d4fa1d Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 1 Mar 2022 16:35:16 -0700 Subject: [PATCH 022/261] Fix types in test --- unit_test/blas/Test_Blas1_dot.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unit_test/blas/Test_Blas1_dot.hpp b/unit_test/blas/Test_Blas1_dot.hpp index 536e58486c..b2e3f95628 100644 --- a/unit_test/blas/Test_Blas1_dot.hpp +++ b/unit_test/blas/Test_Blas1_dot.hpp @@ -208,7 +208,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); - Test::impl_test_dot_mv(789, 1); + Test::impl_test_dot_mv(789, 1); // Test::impl_test_dot_mv(132231,5); #endif @@ -220,7 +220,7 @@ int test_dot_mv() { Test::impl_test_dot_mv(0, 5); Test::impl_test_dot_mv(13, 5); Test::impl_test_dot_mv(1024, 5); - Test::impl_test_dot_mv(789, 1); + Test::impl_test_dot_mv(789, 1); // Test::impl_test_dot_mv(132231,5); #endif From 1f7a45e00f5be82c87ff74bf14b7d217b37c985b Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 2 Mar 2022 11:59:26 -0700 Subject: [PATCH 023/261] Fix nrm2w unification layer, add nrm2w_squared test --- src/blas/KokkosBlas1_nrm2w.hpp | 26 ++- src/blas/KokkosBlas1_nrm2w_squared.hpp | 26 ++- unit_test/blas/Test_Blas.hpp | 1 + unit_test/blas/Test_Blas1_nrm2w_squared.hpp | 232 ++++++++++++++++++++ 4 files changed, 261 insertions(+), 24 deletions(-) create mode 100644 unit_test/blas/Test_Blas1_nrm2w_squared.hpp diff --git a/src/blas/KokkosBlas1_nrm2w.hpp b/src/blas/KokkosBlas1_nrm2w.hpp index 981897d9ae..43d32e7812 100644 --- a/src/blas/KokkosBlas1_nrm2w.hpp +++ b/src/blas/KokkosBlas1_nrm2w.hpp @@ -76,7 +76,8 @@ nrm2w(const XVector& x, const XVector& w) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; @@ -134,20 +135,21 @@ void nrm2w(const RV& R, const XMV& X, const XMV& W, KokkosKernels::Impl::throw_runtime_exception(os.str()); } + using UnifiedXLayout = + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< + RV, UnifiedXLayout>::array_layout; + // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename RV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > RV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > XMV_Internal; RV_Internal R_internal = R; diff --git a/src/blas/KokkosBlas1_nrm2w_squared.hpp b/src/blas/KokkosBlas1_nrm2w_squared.hpp index 2ab07af0c5..6aec955de2 100644 --- a/src/blas/KokkosBlas1_nrm2w_squared.hpp +++ b/src/blas/KokkosBlas1_nrm2w_squared.hpp @@ -77,7 +77,8 @@ nrm2w_squared(const XVector& x, const XVector& w) { typename XVector::device_type, Kokkos::MemoryTraits > XVector_Internal; - typedef Kokkos::View > RVector_Internal; @@ -135,20 +136,21 @@ void nrm2w_squared( KokkosKernels::Impl::throw_runtime_exception(os.str()); } + using UnifiedXLayout = + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout; + using UnifiedRVLayout = + typename KokkosKernels::Impl::GetUnifiedLayoutPreferring< + RV, UnifiedXLayout>::array_layout; + // Create unmanaged versions of the input Views. RV and XMV may be // rank 1 or rank 2. - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename RV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > RV_Internal; - typedef Kokkos::View< - typename std::conditional::type, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XMV::device_type, Kokkos::MemoryTraits > + typedef Kokkos::View > XMV_Internal; RV_Internal R_internal = R; diff --git a/unit_test/blas/Test_Blas.hpp b/unit_test/blas/Test_Blas.hpp index 5244c35e53..16d54e3dce 100644 --- a/unit_test/blas/Test_Blas.hpp +++ b/unit_test/blas/Test_Blas.hpp @@ -15,6 +15,7 @@ #include "Test_Blas1_nrm1.hpp" #include "Test_Blas1_nrm2_squared.hpp" #include "Test_Blas1_nrm2.hpp" +#include "Test_Blas1_nrm2w_squared.hpp" #include "Test_Blas1_nrm2w.hpp" #include "Test_Blas1_nrminf.hpp" #include "Test_Blas1_reciprocal.hpp" diff --git a/unit_test/blas/Test_Blas1_nrm2w_squared.hpp b/unit_test/blas/Test_Blas1_nrm2w_squared.hpp new file mode 100644 index 0000000000..14f1c90766 --- /dev/null +++ b/unit_test/blas/Test_Blas1_nrm2w_squared.hpp @@ -0,0 +1,232 @@ +#include +#include +#include +#include +#include + +namespace Test { +template +void impl_test_nrm2w_squared(int N) { + typedef typename ViewTypeA::value_type ScalarA; + typedef Kokkos::ArithTraits AT; + + ViewTypeA a("A", N); + ViewTypeA w("W", N); + + typename ViewTypeA::HostMirror h_a = Kokkos::create_mirror_view(a); + typename ViewTypeA::HostMirror h_w = Kokkos::create_mirror_view(w); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(a, rand_pool, randStart, randEnd); + Kokkos::fill_random(w, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(h_a, a); + Kokkos::deep_copy(h_w, w); + + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + + typename AT::mag_type expected_result = 0; + for (int i = 0; i < N; i++) { + typename AT::mag_type term = AT::abs(h_a(i)) / AT::abs(h_w(i)); + expected_result += term * term; + } + + typename AT::mag_type nonconst_result = KokkosBlas::nrm2w_squared(a, w); + EXPECT_NEAR_KK(nonconst_result, expected_result, eps * expected_result); +} + +template +void impl_test_nrm2w_squared_mv(int N, int K) { + typedef typename ViewTypeA::value_type ScalarA; + typedef Kokkos::ArithTraits AT; + + typedef multivector_layout_adapter vfA_type; + + typename vfA_type::BaseType b_a("A", N, K); + typename vfA_type::BaseType b_w("W", N, K); + + ViewTypeA a = vfA_type::view(b_a); + ViewTypeA w = vfA_type::view(b_w); + + typedef multivector_layout_adapter h_vfA_type; + + typename h_vfA_type::BaseType h_b_a = Kokkos::create_mirror_view(b_a); + typename h_vfA_type::BaseType h_b_w = Kokkos::create_mirror_view(b_w); + + typename ViewTypeA::HostMirror h_a = h_vfA_type::view(h_b_a); + typename ViewTypeA::HostMirror h_w = h_vfA_type::view(h_b_w); + + Kokkos::Random_XorShift64_Pool rand_pool( + 13718); + + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a, rand_pool, randStart, randEnd); + Kokkos::fill_random(b_w, rand_pool, randStart, randEnd); + + Kokkos::deep_copy(h_b_a, b_a); + Kokkos::deep_copy(h_b_w, b_w); + + typename AT::mag_type* expected_result = new typename AT::mag_type[K]; + for (int j = 0; j < K; j++) { + expected_result[j] = typename AT::mag_type(); + for (int i = 0; i < N; i++) { + typename AT::mag_type term = AT::abs(h_a(i, j)) / AT::abs(h_w(i, j)); + expected_result[j] += term * term; + } + } + + double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; + + Kokkos::View r("Dot::Result", K); + KokkosBlas::nrm2w_squared(r, a, w); + auto r_host = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), r); + + for (int k = 0; k < K; k++) { + typename AT::mag_type nonconst_result = r_host(k); + EXPECT_NEAR_KK(nonconst_result, expected_result[k], + eps * expected_result[k]); + } + + delete[] expected_result; +} +} // namespace Test + +template +int test_nrm2w_squared() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ll; + Test::impl_test_nrm2w_squared(0); + Test::impl_test_nrm2w_squared(13); + Test::impl_test_nrm2w_squared(1024); + // Test::impl_test_nrm2(132231); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_lr; + Test::impl_test_nrm2w_squared(0); + Test::impl_test_nrm2w_squared(13); + Test::impl_test_nrm2w_squared(1024); + // Test::impl_test_nrm2(132231); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_squared(0); + Test::impl_test_nrm2w_squared(13); + Test::impl_test_nrm2w_squared(1024); + // Test::impl_test_nrm2(132231); +#endif + + return 1; +} + +template +int test_nrm2w_squared_mv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ll; + Test::impl_test_nrm2w_squared_mv(0, 5); + Test::impl_test_nrm2w_squared_mv(13, 5); + Test::impl_test_nrm2w_squared_mv(1024, 5); + Test::impl_test_nrm2w_squared_mv(789, 1); + // Test::impl_test_nrm2w_squared_mv(132231,5); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_lr; + Test::impl_test_nrm2w_squared_mv(0, 5); + Test::impl_test_nrm2w_squared_mv(13, 5); + Test::impl_test_nrm2w_squared_mv(1024, 5); + Test::impl_test_nrm2w_squared_mv(789, 1); + // Test::impl_test_nrm2w_squared_mv(132231,5); +#endif + +#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + typedef Kokkos::View view_type_a_ls; + Test::impl_test_nrm2w_squared_mv(0, 5); + Test::impl_test_nrm2w_squared_mv(13, 5); + Test::impl_test_nrm2w_squared_mv(1024, 5); + Test::impl_test_nrm2w_squared_mv(789, 1); + // Test::impl_test_nrm2w_squared_mv(132231,5); +#endif + + return 1; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_squared_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_float"); + test_nrm2w_squared(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_squared_mv_float) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_float"); + test_nrm2w_squared_mv(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_squared_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_double"); + test_nrm2w_squared(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_squared_mv_double) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_double"); + test_nrm2w_squared_mv(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_squared_complex_double) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::nrm2w_squared_complex_double"); + test_nrm2w_squared, TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_squared_mv_complex_double) { + Kokkos::Profiling::pushRegion( + "KokkosBlas::Test::nrm2w_squared_mv_complex_double"); + test_nrm2w_squared_mv, TestExecSpace>(); + Kokkos::Profiling::popRegion(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_INT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +TEST_F(TestCategory, nrm2w_squared_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_int"); + test_nrm2w_squared(); + Kokkos::Profiling::popRegion(); +} +TEST_F(TestCategory, nrm2w_squared_mv_int) { + Kokkos::Profiling::pushRegion("KokkosBlas::Test::nrm2w_squared_mv_int"); + test_nrm2w_squared_mv(); + Kokkos::Profiling::popRegion(); +} +#endif From ec468479f2606287525331147be42b7d481d9a1e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 3 Mar 2022 11:00:50 -0700 Subject: [PATCH 024/261] .github/worksflows: Match cm_test_all_sandia ctest timeout --- .github/workflows/osx.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 20aa0c123f..ffdc484346 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -83,4 +83,4 @@ jobs: - name: test working-directory: kokkos-kernels/build - run: ctest -j2 --output-on-failure \ No newline at end of file + run: ctest -j2 --output-on-failure --timeout 2500 \ No newline at end of file From 7129f3b4242ddab0820804b5061421aa5f56f235 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Fri, 7 Jan 2022 14:19:05 +0100 Subject: [PATCH 025/261] Refactor MKL implementation of SpGEMM --- .../impl/KokkosSparse_spgemm_mkl_impl.hpp | 863 ++++++------------ 1 file changed, 283 insertions(+), 580 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 8eb0bd3930..1b22906ea3 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -47,634 +47,337 @@ #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include "mkl_spblas.h" -#include "mkl.h" #endif -#include "KokkosKernels_Utils.hpp" -#include - namespace KokkosSparse { - namespace Impl { -template -void mkl_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, - typename KernelHandle::nnz_lno_t n, - typename KernelHandle::nnz_lno_t k, - in_row_index_view_type row_mapA, - in_nonzero_index_view_type entriesA, - - bool transposeA, bin_row_index_view_type row_mapB, - bin_nonzero_index_view_type entriesB, bool transposeB, - cin_row_index_view_type row_mapC, bool verbose = false) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - typedef typename KernelHandle::nnz_lno_t idx; - typedef typename KernelHandle::size_type size_type; - - typedef typename KernelHandle::HandleTempMemorySpace HandleTempMemorySpace; - typedef typename Kokkos::View - int_temp_work_view_t; - - typedef typename KernelHandle::nnz_scalar_t value_type; - - typedef typename KernelHandle::HandleExecSpace MyExecSpace; - /* - if (!( - (Kokkos::SpaceAccessibility::accessible) && - (Kokkos::SpaceAccessibility::accessible) && - (Kokkos::SpaceAccessibility::accessible) ) - ){ - throw std::runtime_error ("MEMORY IS NOT ALLOCATED IN HOST DEVICE for - MKL\n"); return; - } - */ - if (std::is_same::value) { - int *a_xadj = NULL; - int *b_xadj = NULL; - int_temp_work_view_t a_xadj_v, b_xadj_v; - - if (std::is_same::value) { - a_xadj = (int *)row_mapA.data(); - b_xadj = (int *)row_mapB.data(); - } else { - // TODO test this case. - - Kokkos::Timer copy_time; - const int max_integer = 2147483647; - if (entriesB.extent(0) > max_integer || - entriesA.extent(0) > max_integer) { - throw std::runtime_error( - "MKL requires integer values for size type for SPGEMM. Copying to " - "integer will cause overflow.\n"); - return; - } - a_xadj_v = int_temp_work_view_t("tmpa", m + 1); - a_xadj = (int *)a_xadj_v.data(); - b_xadj_v = int_temp_work_view_t("tmpb", n + 1); - b_xadj = (int *)b_xadj_v.data(); - - KokkosKernels::Impl::copy_vector( - m + 1, row_mapA, a_xadj_v); - - KokkosKernels::Impl::copy_vector( - m + 1, row_mapB, b_xadj_v); - - if (verbose) - std::cout << "MKL COPY size type to int TIME:" << copy_time.seconds() - << std::endl; - } - - int *a_adj = (int *)entriesA.data(); - int *b_adj = (int *)entriesB.data(); - - std::vector tmp_values( - KOKKOSKERNELS_MACRO_MAX(entriesB.extent(0), entriesA.extent(0))); - value_type *ptmp_values = &(tmp_values[0]); - value_type *a_ew = ptmp_values; - value_type *b_ew = ptmp_values; - - sparse_matrix_t A; - sparse_matrix_t B; - sparse_matrix_t C; - - if (std::is_same::value) { - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_s_create_csr(&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj, - a_xadj + 1, a_adj, (float *)a_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n"); - return; - } - - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_s_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj, - b_xadj + 1, b_adj, (float *)b_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n"); - return; - } - - sparse_operation_t operation; - if (transposeA && transposeB) { - operation = SPARSE_OPERATION_TRANSPOSE; - } else if (!(transposeA || transposeB)) { - operation = SPARSE_OPERATION_NON_TRANSPOSE; - } else { - throw std::runtime_error( - "MKL either transpose both matrices, or none for SPGEMM\n"); - return; - } - - Kokkos::Timer timer1; - bool success = - SPARSE_STATUS_SUCCESS != mkl_sparse_spmm(operation, A, B, &C); - if (verbose) - std::cout << "Actual FLOAT MKL SPMM Time in symbolic:" - << timer1.seconds() << std::endl; - - if (success) { - throw std::runtime_error( - "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n"); - - return; - } else { - sparse_index_base_t c_indexing; - MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns; - float *values; - - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_s_export_csr(C, &c_indexing, &c_rows, &c_cols, - &rows_start, &rows_end, &columns, - &values)) { - throw std::runtime_error( - "ERROR at exporting result matrix in mkl_sparse_spmm\n"); - return; - } - - if (SPARSE_INDEX_BASE_ZERO != c_indexing) { - throw std::runtime_error("C is not zero based indexed\n"); - return; - } - - KokkosKernels::Impl::copy_vector< - MKL_INT *, typename cin_row_index_view_type::non_const_type, - MyExecSpace>(m, rows_start, row_mapC); - idx nnz = row_mapC(m) = rows_end[m - 1]; - handle->set_c_nnz(nnz); - } - - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) { - throw std::runtime_error("Error at mkl_sparse_destroy A\n"); - return; - } - - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) { - throw std::runtime_error("Error at mkl_sparse_destroy B\n"); - return; - } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) { - throw std::runtime_error("Error at mkl_sparse_destroy C\n"); - return; - } - } else if (std::is_same::value) { - /* - std::cout << "create a" << std::endl; - std::cout << "m:" << m << " n:" << n << std::endl; - std::cout << "a_xadj[0]:" << a_xadj[0] << " a_xadj[m]:" << a_xadj[m] << - std::endl; std::cout << "a_adj[a_xadj[m] - 1]:" << a_adj[a_xadj[m] - 1] << - " a_ew[a_xadj[m] - 1]:" << a_ew[a_xadj[m] - 1] << std::endl; - */ - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_d_create_csr(&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj, - a_xadj + 1, a_adj, (double *)a_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n"); - return; - } - - // std::cout << "create b" << std::endl; - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_d_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj, - b_xadj + 1, b_adj, (double *)b_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n"); - return; - } +KOKKOS_INLINE_FUNCTION +void mkl_call(sparse_status_t result, const char *err_msg) { + if (SPARSE_STATUS_SUCCESS != result) { + throw std::runtime_error(err_msg); + } +} - sparse_operation_t operation; - if (transposeA && transposeB) { - operation = SPARSE_OPERATION_TRANSPOSE; - } else if (!(transposeA || transposeB)) { - operation = SPARSE_OPERATION_NON_TRANSPOSE; - } else { - throw std::runtime_error( - "MKL either transpose both matrices, or none for SPGEMM\n"); - return; - } +template +class MKLSparseMatrix { + sparse_matrix_t mtx; + + public: + KOKKOS_INLINE_FUNCTION + MKLSparseMatrix(const MKL_INT m, const MKL_INT n, MKL_INT *xadj, MKL_INT *adj, + value_type *values); + + KOKKOS_INLINE_FUNCTION + static MKLSparseMatrix spmm( + sparse_operation_t operation, const MKLSparseMatrix &A, + const MKLSparseMatrix &B) { + sparse_matrix_t c; + mkl_call(mkl_sparse_spmm(operation, A.mtx, B.mtx, &c), + "mkl_sparse_spmm() failed!"); + return MKLSparseMatrix(c); + } - Kokkos::Timer timer1; - bool success = - SPARSE_STATUS_SUCCESS != mkl_sparse_spmm(operation, A, B, &C); - if (verbose) - std::cout << "Actual DOUBLE MKL SPMM Time Without Free:" - << timer1.seconds() << std::endl; - mkl_free_buffers(); - if (verbose) - std::cout << "Actual DOUBLE MKL SPMM Time:" << timer1.seconds() - << std::endl; + KOKKOS_INLINE_FUNCTION + void get(MKL_INT &rows, MKL_INT &cols, MKL_INT *&rows_start, + MKL_INT *&columns, value_type *&values); - if (success) { - throw std::runtime_error( - "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n"); - return; - } else { - sparse_index_base_t c_indexing; - MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns; - double *values; - - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_d_export_csr(C, &c_indexing, &c_rows, &c_cols, - &rows_start, &rows_end, &columns, - &values)) { - throw std::runtime_error( - "ERROR at exporting result matrix in mkl_sparse_spmm\n"); - return; - } - - if (SPARSE_INDEX_BASE_ZERO != c_indexing) { - throw std::runtime_error("C is not zero based indexed\n"); - return; - } - if (handle->mkl_keep_output) { - Kokkos::Timer copy_time; - - KokkosKernels::Impl::copy_vector< - MKL_INT *, typename cin_row_index_view_type::non_const_type, - MyExecSpace>(m, rows_start, row_mapC); - idx nnz = row_mapC(m) = rows_end[m - 1]; - handle->set_c_nnz(nnz); - - double copy_time_d = copy_time.seconds(); - if (verbose) std::cout << "MKL COPYTIME:" << copy_time_d << std::endl; - } - } + KOKKOS_INLINE_FUNCTION + void destroy() { + mkl_call(mkl_sparse_destroy(mtx), "mkl_sparse_destroy() failed!"); + } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) { - throw std::runtime_error("Error at mkl_sparse_destroy A\n"); - return; - } + private: + KOKKOS_INLINE_FUNCTION + MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {} +}; + +template <> +KOKKOS_INLINE_FUNCTION MKLSparseMatrix::MKLSparseMatrix( + const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, + float *values) { + mkl_call(mkl_sparse_s_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, + xadj, xadj + 1, adj, values), + "mkl_sparse_s_create_csr() failed!"); +} - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) { - throw std::runtime_error("Error at mkl_sparse_destroy B\n"); - return; - } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) { - throw std::runtime_error("Error at mkl_sparse_destroy C\n"); - return; - } +template <> +KOKKOS_INLINE_FUNCTION MKLSparseMatrix::MKLSparseMatrix( + const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, + double *values) { + mkl_call(mkl_sparse_d_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, + xadj, xadj + 1, adj, values), + "mkl_sparse_d_create_csr() failed!"); +} - } else { - throw std::runtime_error( - "MKL requires float or double values. Complex values are not " - "implemented yet.\n"); - return; - } - } else { - throw std::runtime_error("MKL requires local ordinals to be integer.\n"); +template <> +KOKKOS_INLINE_FUNCTION void MKLSparseMatrix::get(MKL_INT &rows, + MKL_INT &cols, + MKL_INT *&rows_start, + MKL_INT *&columns, + float *&values) { + sparse_index_base_t indexing; + MKL_INT *rows_end; + mkl_call(mkl_sparse_s_export_csr(mtx, &indexing, &rows, &cols, &rows_start, + &rows_end, &columns, &values), + "Failed to export matrix with mkl_sparse_s_export_csr()!"); + if (SPARSE_INDEX_BASE_ZERO != indexing) { + throw std::runtime_error( + "Expected zero based indexing in exported MKL sparse matrix\n"); return; } -#else - (void)handle; - (void)m; - (void)n; - (void)k; - (void)row_mapA; - (void)row_mapB; - (void)row_mapC; - (void)entriesA; - (void)entriesB; - (void)transposeA; - (void)transposeB; - (void)verbose; - throw std::runtime_error("MKL IS NOT DEFINED\n"); - // return; -#endif } -template < - typename KernelHandle, typename in_row_index_view_type, - typename in_nonzero_index_view_type, typename in_nonzero_value_view_type, - typename bin_row_index_view_type, typename bin_nonzero_index_view_type, - typename bin_nonzero_value_view_type, typename cin_row_index_view_type, - typename cin_nonzero_index_view_type, typename cin_nonzero_value_view_type> -void mkl_apply(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, - typename KernelHandle::nnz_lno_t n, - typename KernelHandle::nnz_lno_t k, - in_row_index_view_type row_mapA, - in_nonzero_index_view_type entriesA, - in_nonzero_value_view_type valuesA, - - bool transposeA, bin_row_index_view_type row_mapB, - bin_nonzero_index_view_type entriesB, - bin_nonzero_value_view_type valuesB, bool transposeB, - cin_row_index_view_type row_mapC, - cin_nonzero_index_view_type entriesC, - cin_nonzero_value_view_type valuesC, bool verbose = false) { -#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +template <> +KOKKOS_INLINE_FUNCTION void MKLSparseMatrix::get(MKL_INT &rows, + MKL_INT &cols, + MKL_INT *&rows_start, + MKL_INT *&columns, + double *&values) { + sparse_index_base_t indexing; + MKL_INT *rows_end; + mkl_call(mkl_sparse_d_export_csr(mtx, &indexing, &rows, &cols, &rows_start, + &rows_end, &columns, &values), + "Failed to export matrix with mkl_sparse_s_export_csr()!"); + if (SPARSE_INDEX_BASE_ZERO != indexing) { + throw std::runtime_error( + "Expected zero based indexing in exported MKL sparse matrix\n"); + return; + } +} - typedef typename KernelHandle::nnz_lno_t idx; +template +class MKLApply { + public: + typedef typename KernelHandle::nnz_lno_t nnz_lno_t; typedef typename KernelHandle::size_type size_type; - - typedef typename KernelHandle::HandleTempMemorySpace HandleTempMemorySpace; - typedef typename Kokkos::View - int_temp_work_view_t; - typedef typename KernelHandle::nnz_scalar_t value_type; - + typedef typename KernelHandle::HandleTempMemorySpace HandleTempMemorySpace; typedef typename KernelHandle::HandleExecSpace MyExecSpace; - /* - if (!( - (Kokkos::SpaceAccessibility::accessible) && - (Kokkos::SpaceAccessibility::accessible) && - (Kokkos::SpaceAccessibility::accessible) ) - ){ - throw std::runtime_error ("MEMORY IS NOT ALLOCATED IN HOST DEVICE for - MKL\n"); return; - } - */ - if (std::is_same::value) { - int *a_xadj = NULL; - int *b_xadj = NULL; - int_temp_work_view_t a_xadj_v, b_xadj_v; - - if (std::is_same::value) { - a_xadj = (int *)row_mapA.data(); - b_xadj = (int *)row_mapB.data(); - } else { - // TODO test this case. - - Kokkos::Timer copy_time; - const int max_integer = 2147483647; - if (entriesB.extent(0) > max_integer || - entriesA.extent(0) > max_integer) { - throw std::runtime_error( - "MKL requires integer values for size type for SPGEMM. Copying to " - "integer will cause overflow.\n"); - return; - } - a_xadj_v = int_temp_work_view_t("tmpa", m + 1); - a_xadj = (int *)a_xadj_v.data(); - b_xadj_v = int_temp_work_view_t("tmpb", n + 1); - b_xadj = (int *)b_xadj_v.data(); - - KokkosKernels::Impl::copy_vector( - m + 1, row_mapA, a_xadj_v); - - KokkosKernels::Impl::copy_vector( - m + 1, row_mapB, b_xadj_v); - - if (verbose) - std::cout << "MKL COPY size type to int TIME:" << copy_time.seconds() - << std::endl; + typedef typename Kokkos::View int_tmp_view_t; + + public: + static void mkl_symbolic(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, + nnz_lno_t k, a_rowmap_view_type row_mapA, + a_index_view_type entriesA, bool transposeA, + b_rowmap_view_type row_mapB, + b_index_view_type entriesB, bool transposeB, + c_rowmap_view_type row_mapC, bool verbose = false) { + if (m < 1 || n < 1 || k < 1 || row_mapA(m) < 1 || row_mapB(n) < 1) { + // set correct values in non-empty 0-nnz corner case + handle->set_c_nnz(0); + Kokkos::deep_copy(row_mapC, 0); + return; } - int *a_adj = (int *)entriesA.data(); - int *b_adj = (int *)entriesB.data(); + Kokkos::Timer timer; + using scalar_t = typename KernelHandle::nnz_scalar_t; + using tmp_values_type = + Kokkos::View; - const value_type *a_ew = valuesA.data(); - const value_type *b_ew = valuesB.data(); + const auto export_rowmap = [&](MKL_INT m, MKL_INT *rows_start, + MKL_INT *columns, scalar_t *values) { + if (handle->mkl_keep_output) { + Kokkos::Timer copy_time; + const nnz_lno_t nnz = rows_start[m]; + handle->set_c_nnz(nnz); + copy(m + 1, rows_start, row_mapC); + if (verbose) + std::cout << "\tMKL rowmap export time:" << copy_time.seconds() + << std::endl; + } + }; - sparse_matrix_t A; - sparse_matrix_t B; - sparse_matrix_t C; + // use dummy values for A and B inputs + tmp_values_type tmp_values( + Kokkos::ViewAllocateWithoutInitializing("tmp_values"), + KOKKOSKERNELS_MACRO_MAX(entriesA.extent(0), entriesB.extent(0))); - if (std::is_same::value) { - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_s_create_csr(&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj, - a_xadj + 1, a_adj, (float *)a_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n"); - return; - } + apply(handle, m, n, k, row_mapA, entriesA, tmp_values, transposeA, row_mapB, + entriesB, tmp_values, transposeB, verbose, export_rowmap); - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_s_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj, - b_xadj + 1, b_adj, (float *)b_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n"); - return; - } + if (verbose) + std::cout << "MKL symbolic time:" << timer.seconds() << std::endl; + } - sparse_operation_t operation; - if (transposeA && transposeB) { - operation = SPARSE_OPERATION_TRANSPOSE; - } else if (!(transposeA || transposeB)) { - operation = SPARSE_OPERATION_NON_TRANSPOSE; - } else { - throw std::runtime_error( - "MKL either transpose both matrices, or none for SPGEMM\n"); - return; - } + static void mkl_numeric( + KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, + a_rowmap_view_type row_mapA, a_index_view_type entriesA, + a_values_view_type valuesA, bool transposeA, b_rowmap_view_type row_mapB, + b_index_view_type entriesB, b_values_view_type valuesB, bool transposeB, + c_rowmap_view_type row_mapC, c_index_view_type entriesC, + c_values_view_type valuesC, bool verbose = false) { + Kokkos::Timer timer; + + const auto export_values = + [&](MKL_INT m, MKL_INT *rows_start, MKL_INT *columns, + typename KernelHandle::nnz_scalar_t *values) { + if (handle->mkl_keep_output) { + Kokkos::Timer copy_time; + const nnz_lno_t nnz = rows_start[m]; + copy(nnz, columns, entriesC); + copy(nnz, values, valuesC); + if (verbose) + std::cout << "\tMKL values export time:" << copy_time.seconds() + << std::endl; + } + }; + + apply(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB, + entriesB, valuesB, transposeB, verbose, export_values); + + if (verbose) + std::cout << "MKL numeric time:" << timer.seconds() << std::endl; + } - Kokkos::Timer timer1; - bool success = - SPARSE_STATUS_SUCCESS != mkl_sparse_spmm(operation, A, B, &C); - if (verbose) - std::cout << "Actual FLOAT MKL SPMM Time:" << timer1.seconds() - << std::endl; + private: + static constexpr int max_integer = 2147483647; - if (success) { - throw std::runtime_error( - "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n"); - - return; - } else { - sparse_index_base_t c_indexing; - MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns; - float *values; - - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_s_export_csr(C, &c_indexing, &c_rows, &c_cols, - &rows_start, &rows_end, &columns, - &values)) { - throw std::runtime_error( - "ERROR at exporting result matrix in mkl_sparse_spmm\n"); - return; - } - - if (SPARSE_INDEX_BASE_ZERO != c_indexing) { - throw std::runtime_error("C is not zero based indexed\n"); - return; - } - - // KokkosKernels::Impl::copy_vector (m, rows_start, - // row_mapC); idx nnz = row_mapC(m) = rows_end[m - 1]; - idx nnz = rows_end[m - 1]; - using non_const_size_type = - typename cin_row_index_view_type::non_const_value_type; - auto *tmpPtr = const_cast(row_mapC.data()); - tmpPtr[m] = nnz; - - KokkosKernels::Impl::copy_vector< - MKL_INT *, typename cin_nonzero_index_view_type::non_const_type, - MyExecSpace>(nnz, columns, entriesC); - KokkosKernels::Impl::copy_vector< - float *, typename cin_nonzero_value_view_type::non_const_type, - MyExecSpace>(nnz, values, valuesC); - } + private: + template + static void apply(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, + a_rowmap_view_type row_mapA, a_index_view_type entriesA, + a_values_view_type valuesA, - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) { - throw std::runtime_error("Error at mkl_sparse_destroy A\n"); - return; - } + bool transposeA, b_rowmap_view_type row_mapB, + b_index_view_type entriesB, b_values_view_type valuesB, + bool transposeB, bool verbose, const CB &callback) { + if (!std::is_same::value) { + throw std::runtime_error("MKL requires local ordinals to be integer.\n"); + } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) { - throw std::runtime_error("Error at mkl_sparse_destroy B\n"); - return; - } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) { - throw std::runtime_error("Error at mkl_sparse_destroy C\n"); - return; - } - } else if (std::is_same::value) { - /* - std::cout << "create a" << std::endl; - std::cout << "m:" << m << " n:" << n << std::endl; - std::cout << "a_xadj[0]:" << a_xadj[0] << " a_xadj[m]:" << a_xadj[m] << - std::endl; std::cout << "a_adj[a_xadj[m] - 1]:" << a_adj[a_xadj[m] - 1] << - " a_ew[a_xadj[m] - 1]:" << a_ew[a_xadj[m] - 1] << std::endl; - */ - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_d_create_csr(&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj, - a_xadj + 1, a_adj, (double *)a_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n"); - return; - } + if (m < 1 || n < 1 || k < 1 || row_mapA(m) < 1 || row_mapB(n) < 1) { + return; + } - // std::cout << "create b" << std::endl; - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_d_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj, - b_xadj + 1, b_adj, (double *)b_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n"); - return; - } + int *a_xadj = (int *)row_mapA.data(); + int *b_xadj = (int *)row_mapB.data(); + int_tmp_view_t a_xadj_v, b_xadj_v; - sparse_operation_t operation; - if (transposeA && transposeB) { - operation = SPARSE_OPERATION_TRANSPOSE; - } else if (!(transposeA || transposeB)) { - operation = SPARSE_OPERATION_NON_TRANSPOSE; - } else { + if (!std::is_same::value) { + if (entriesA.extent(0) > max_integer || + entriesB.extent(0) > max_integer) { throw std::runtime_error( - "MKL either transpose both matrices, or none for SPGEMM\n"); - return; + "MKL requires integer values for size type for SPGEMM. Copying " + "to " + "integer will cause overflow.\n"); } + static_assert( + std::is_same::value, + "deep_copy requires non-const destination type"); - Kokkos::Timer timer1; - bool success = - SPARSE_STATUS_SUCCESS != mkl_sparse_spmm(operation, A, B, &C); - if (verbose) - std::cout << "Actual DOUBLE MKL SPMM Time Without Free:" - << timer1.seconds() << std::endl; - - mkl_free_buffers(); + Kokkos::Timer copy_time; + a_xadj_v = int_tmp_view_t("tmpa", m + 1); + b_xadj_v = int_tmp_view_t("tmpb", n + 1); + Kokkos::deep_copy(a_xadj_v, row_mapA); + Kokkos::deep_copy(b_xadj_v, row_mapB); + a_xadj = (int *)a_xadj_v.data(); + b_xadj = (int *)b_xadj_v.data(); if (verbose) - std::cout << "Actual DOUBLE MKL SPMM Time:" << timer1.seconds() - << std::endl; - - if (success) { - throw std::runtime_error( - "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n"); - return; - } else { - sparse_index_base_t c_indexing; - MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns; - double *values; - - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_d_export_csr(C, &c_indexing, &c_rows, &c_cols, - &rows_start, &rows_end, &columns, - &values)) { - throw std::runtime_error( - "ERROR at exporting result matrix in mkl_sparse_spmm\n"); - return; - } - - if (SPARSE_INDEX_BASE_ZERO != c_indexing) { - throw std::runtime_error("C is not zero based indexed\n"); - return; - } - if (handle->mkl_keep_output) { - Kokkos::Timer copy_time; - - // KokkosKernels::Impl::copy_vector (m, - // rows_start, row_mapC); idx nnz = row_mapC(m) = rows_end[m - 1]; - idx nnz = rows_end[m - 1]; - using non_const_size_type = - typename cin_row_index_view_type::non_const_value_type; - auto *tmpPtr = const_cast(row_mapC.data()); - tmpPtr[m] = nnz; - - KokkosKernels::Impl::copy_vector< - MKL_INT *, typename cin_nonzero_index_view_type::non_const_type, - MyExecSpace>(nnz, columns, entriesC); - KokkosKernels::Impl::copy_vector< - double *, typename cin_nonzero_value_view_type::non_const_type, - MyExecSpace>(nnz, values, valuesC); - double copy_time_d = copy_time.seconds(); - if (verbose) std::cout << "MKL COPYTIME:" << copy_time_d << std::endl; - } - } + std::cout << "\tMKL int-type temp rowmap copy time:" + << copy_time.seconds() << std::endl; + } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) { - throw std::runtime_error("Error at mkl_sparse_destroy A\n"); - return; - } + value_type *a_ew = (value_type *)valuesA.data(); + value_type *b_ew = (value_type *)valuesB.data(); - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) { - throw std::runtime_error("Error at mkl_sparse_destroy B\n"); - return; - } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) { - throw std::runtime_error("Error at mkl_sparse_destroy C\n"); - return; - } + using Matrix = MKLSparseMatrix; + Matrix A(m, n, a_xadj, (int *)(entriesA.data()), a_ew); + Matrix B(n, k, b_xadj, (int *)entriesB.data(), b_ew); + sparse_operation_t operation; + if (transposeA && transposeB) { + operation = SPARSE_OPERATION_TRANSPOSE; + } else if (!(transposeA || transposeB)) { + operation = SPARSE_OPERATION_NON_TRANSPOSE; } else { throw std::runtime_error( - "MKL requires float or double values. Complex values are not " - "implemented yet.\n"); - return; + "MKL either transpose both matrices, or none for SPGEMM\n"); } - } else { - throw std::runtime_error("MKL requires local ordinals to be integer.\n"); - return; + + Kokkos::Timer timer1; + Matrix C = Matrix::spmm(operation, A, B); + if (verbose) { + std::cout << "\tMKL spmm ("; + if (std::is_same::value) + std::cout << "FLOAT"; + else if (std::is_same::value) + std::cout << "DOUBLE"; + else + std::cout << "?"; + std::cout << ") time:" << timer1.seconds() << std::endl; + } + + MKL_INT c_rows, c_cols, *rows_start, *columns; + value_type *values; + C.get(c_rows, c_cols, rows_start, columns, values); + callback(m, rows_start, columns, values); + + A.destroy(); + B.destroy(); + C.destroy(); + } + + template + KOKKOS_INLINE_FUNCTION static void copy(size_t num_elems, from_type from, + to_type to) { + KokkosKernels::Impl::copy_vector(num_elems, + from, to); } +}; +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL + +template +void mkl_symbolic(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, + a_rowmap_type row_mapA, a_index_type entriesA, + bool transposeA, b_rowmap_type row_mapB, + b_index_type entriesB, bool transposeB, + c_rowmap_type row_mapC, bool verbose = false) { +#ifndef KOKKOSKERNELS_ENABLE_TPL_MKL + throw std::runtime_error("MKL was not enabled in this build!"); +#else + using values_type = typename KernelHandle::scalar_temp_work_view_t; + using c_index_type = b_index_type; + using mkl = MKLApply; + mkl::mkl_symbolic(handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB, + entriesB, transposeB, row_mapC, verbose); +#endif +} + +template +void mkl_apply(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, + a_rowmap_type row_mapA, a_index_type entriesA, + a_values_type valuesA, bool transposeA, b_rowmap_type row_mapB, + b_index_type entriesB, b_values_type valuesB, bool transposeB, + c_rowmap_type row_mapC, c_index_type entriesC, + c_values_type valuesC, bool verbose = false) { +#ifndef KOKKOSKERNELS_ENABLE_TPL_MKL + throw std::runtime_error("MKL was not enabled in this build!"); #else - (void)handle; - (void)m; - (void)n; - (void)k; - (void)row_mapA; - (void)row_mapB; - (void)row_mapC; - (void)entriesA; - (void)entriesB; - (void)entriesC; - (void)valuesA; - (void)valuesB; - (void)valuesC; - (void)transposeA; - (void)transposeB; - (void)verbose; - throw std::runtime_error("MKL IS NOT DEFINED\n"); - // return; + using mkl = MKLApply; + mkl::mkl_numeric(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA, + row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC, + valuesC, verbose); #endif } + } // namespace Impl } // namespace KokkosSparse From 272461125c6ea2afae9c6ea1c79c02ad89c75cc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Fri, 7 Jan 2022 14:19:05 +0100 Subject: [PATCH 026/261] Fix MKL dispatch in SpGEMM unit test --- unit_test/sparse/Test_Sparse_spgemm.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp index b84ef6acc4..e5ab088bdc 100644 --- a/unit_test/sparse/Test_Sparse_spgemm.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm.hpp @@ -280,7 +280,7 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */ }; -#ifdef HAVE_KOKKOSKERNELS_MKL +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL algorithms.push_back(SPGEMM_MKL); #endif From 5d535fea8744262e775abd3e31b53b4fdea64554 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Fri, 7 Jan 2022 14:19:05 +0100 Subject: [PATCH 027/261] Fixed inlining: don't comile exception throwing MKL wrappers for GPU --- .../impl/KokkosSparse_spgemm_mkl_impl.hpp | 55 ++++++++----------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 1b22906ea3..44ae49fc34 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -54,8 +54,7 @@ namespace Impl { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -KOKKOS_INLINE_FUNCTION -void mkl_call(sparse_status_t result, const char *err_msg) { +inline void mkl_call(sparse_status_t result, const char *err_msg) { if (SPARSE_STATUS_SUCCESS != result) { throw std::runtime_error(err_msg); } @@ -66,12 +65,10 @@ class MKLSparseMatrix { sparse_matrix_t mtx; public: - KOKKOS_INLINE_FUNCTION - MKLSparseMatrix(const MKL_INT m, const MKL_INT n, MKL_INT *xadj, MKL_INT *adj, - value_type *values); + inline MKLSparseMatrix(const MKL_INT m, const MKL_INT n, MKL_INT *xadj, + MKL_INT *adj, value_type *values); - KOKKOS_INLINE_FUNCTION - static MKLSparseMatrix spmm( + inline static MKLSparseMatrix spmm( sparse_operation_t operation, const MKLSparseMatrix &A, const MKLSparseMatrix &B) { sparse_matrix_t c; @@ -80,44 +77,41 @@ class MKLSparseMatrix { return MKLSparseMatrix(c); } - KOKKOS_INLINE_FUNCTION - void get(MKL_INT &rows, MKL_INT &cols, MKL_INT *&rows_start, - MKL_INT *&columns, value_type *&values); + inline void get(MKL_INT &rows, MKL_INT &cols, MKL_INT *&rows_start, + MKL_INT *&columns, value_type *&values); - KOKKOS_INLINE_FUNCTION - void destroy() { + inline void destroy() { mkl_call(mkl_sparse_destroy(mtx), "mkl_sparse_destroy() failed!"); } private: - KOKKOS_INLINE_FUNCTION - MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {} + inline MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {} }; template <> -KOKKOS_INLINE_FUNCTION MKLSparseMatrix::MKLSparseMatrix( - const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, - float *values) { +inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, + const MKL_INT cols, + MKL_INT *xadj, MKL_INT *adj, + float *values) { mkl_call(mkl_sparse_s_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj, values), "mkl_sparse_s_create_csr() failed!"); } template <> -KOKKOS_INLINE_FUNCTION MKLSparseMatrix::MKLSparseMatrix( - const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, - double *values) { +inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, + const MKL_INT cols, + MKL_INT *xadj, MKL_INT *adj, + double *values) { mkl_call(mkl_sparse_d_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj, values), "mkl_sparse_d_create_csr() failed!"); } template <> -KOKKOS_INLINE_FUNCTION void MKLSparseMatrix::get(MKL_INT &rows, - MKL_INT &cols, - MKL_INT *&rows_start, - MKL_INT *&columns, - float *&values) { +inline void MKLSparseMatrix::get(MKL_INT &rows, MKL_INT &cols, + MKL_INT *&rows_start, MKL_INT *&columns, + float *&values) { sparse_index_base_t indexing; MKL_INT *rows_end; mkl_call(mkl_sparse_s_export_csr(mtx, &indexing, &rows, &cols, &rows_start, @@ -131,11 +125,9 @@ KOKKOS_INLINE_FUNCTION void MKLSparseMatrix::get(MKL_INT &rows, } template <> -KOKKOS_INLINE_FUNCTION void MKLSparseMatrix::get(MKL_INT &rows, - MKL_INT &cols, - MKL_INT *&rows_start, - MKL_INT *&columns, - double *&values) { +inline void MKLSparseMatrix::get(MKL_INT &rows, MKL_INT &cols, + MKL_INT *&rows_start, + MKL_INT *&columns, double *&values) { sparse_index_base_t indexing; MKL_INT *rows_end; mkl_call(mkl_sparse_d_export_csr(mtx, &indexing, &rows, &cols, &rows_start, @@ -326,8 +318,7 @@ class MKLApply { } template - KOKKOS_INLINE_FUNCTION static void copy(size_t num_elems, from_type from, - to_type to) { + inline static void copy(size_t num_elems, from_type from, to_type to) { KokkosKernels::Impl::copy_vector(num_elems, from, to); } From 3556dffffc2cb4088e883bf55e805f227885a8a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Fri, 7 Jan 2022 14:19:05 +0100 Subject: [PATCH 028/261] Support GPU memory space in MKL spgemm --- .../impl/KokkosSparse_spgemm_mkl_impl.hpp | 81 ++++++++++++------- unit_test/sparse/Test_Sparse_spgemm.hpp | 6 -- 2 files changed, 54 insertions(+), 33 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 44ae49fc34..9bc4a9faac 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -150,9 +150,8 @@ class MKLApply { typedef typename KernelHandle::nnz_lno_t nnz_lno_t; typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_scalar_t value_type; - typedef typename KernelHandle::HandleTempMemorySpace HandleTempMemorySpace; typedef typename KernelHandle::HandleExecSpace MyExecSpace; - typedef typename Kokkos::View int_tmp_view_t; + typedef typename Kokkos::View int_tmp_view_t; public: static void mkl_symbolic(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, @@ -161,7 +160,8 @@ class MKLApply { b_rowmap_view_type row_mapB, b_index_view_type entriesB, bool transposeB, c_rowmap_view_type row_mapC, bool verbose = false) { - if (m < 1 || n < 1 || k < 1 || row_mapA(m) < 1 || row_mapB(n) < 1) { + if (m < 1 || n < 1 || k < 1 || entriesA.extent(0) < 1 || + entriesB.extent(0) < 1) { // set correct values in non-empty 0-nnz corner case handle->set_c_nnz(0); Kokkos::deep_copy(row_mapC, 0); @@ -170,8 +170,6 @@ class MKLApply { Kokkos::Timer timer; using scalar_t = typename KernelHandle::nnz_scalar_t; - using tmp_values_type = - Kokkos::View; const auto export_rowmap = [&](MKL_INT m, MKL_INT *rows_start, MKL_INT *columns, scalar_t *values) { @@ -179,7 +177,7 @@ class MKLApply { Kokkos::Timer copy_time; const nnz_lno_t nnz = rows_start[m]; handle->set_c_nnz(nnz); - copy(m + 1, rows_start, row_mapC); + copy(make_host_view(rows_start, m + 1), row_mapC); if (verbose) std::cout << "\tMKL rowmap export time:" << copy_time.seconds() << std::endl; @@ -187,12 +185,15 @@ class MKLApply { }; // use dummy values for A and B inputs - tmp_values_type tmp_values( - Kokkos::ViewAllocateWithoutInitializing("tmp_values"), - KOKKOSKERNELS_MACRO_MAX(entriesA.extent(0), entriesB.extent(0))); + a_values_view_type tmp_valsA( + Kokkos::ViewAllocateWithoutInitializing("tmp_valuesA"), + entriesA.extent(0)); + b_values_view_type tmp_valsB( + Kokkos::ViewAllocateWithoutInitializing("tmp_valuesB"), + entriesB.extent(0)); - apply(handle, m, n, k, row_mapA, entriesA, tmp_values, transposeA, row_mapB, - entriesB, tmp_values, transposeB, verbose, export_rowmap); + apply(handle, m, n, k, row_mapA, entriesA, tmp_valsA, transposeA, row_mapB, + entriesB, tmp_valsB, transposeB, verbose, export_rowmap); if (verbose) std::cout << "MKL symbolic time:" << timer.seconds() << std::endl; @@ -213,8 +214,8 @@ class MKLApply { if (handle->mkl_keep_output) { Kokkos::Timer copy_time; const nnz_lno_t nnz = rows_start[m]; - copy(nnz, columns, entriesC); - copy(nnz, values, valuesC); + copy(make_host_view(columns, nnz), entriesC); + copy(make_host_view(values, nnz), valuesC); if (verbose) std::cout << "\tMKL values export time:" << copy_time.seconds() << std::endl; @@ -244,12 +245,19 @@ class MKLApply { throw std::runtime_error("MKL requires local ordinals to be integer.\n"); } - if (m < 1 || n < 1 || k < 1 || row_mapA(m) < 1 || row_mapB(n) < 1) { + if (m < 1 || n < 1 || k < 1 || entriesA.extent(0) < 1 || + entriesB.extent(0) < 1) { return; } - int *a_xadj = (int *)row_mapA.data(); - int *b_xadj = (int *)row_mapB.data(); + const auto create_mirror = [](auto view) { + return Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view); + }; + + auto h_rowsA = create_mirror(row_mapA); + auto h_rowsB = create_mirror(row_mapB); + const int *a_xadj = reinterpret_cast(h_rowsA.data()); + const int *b_xadj = reinterpret_cast(h_rowsB.data()); int_tmp_view_t a_xadj_v, b_xadj_v; if (!std::is_same::value) { @@ -268,8 +276,8 @@ class MKLApply { Kokkos::Timer copy_time; a_xadj_v = int_tmp_view_t("tmpa", m + 1); b_xadj_v = int_tmp_view_t("tmpb", n + 1); - Kokkos::deep_copy(a_xadj_v, row_mapA); - Kokkos::deep_copy(b_xadj_v, row_mapB); + Kokkos::deep_copy(a_xadj_v, h_rowsA); + Kokkos::deep_copy(b_xadj_v, h_rowsB); a_xadj = (int *)a_xadj_v.data(); b_xadj = (int *)b_xadj_v.data(); if (verbose) @@ -277,12 +285,20 @@ class MKLApply { << copy_time.seconds() << std::endl; } - value_type *a_ew = (value_type *)valuesA.data(); - value_type *b_ew = (value_type *)valuesB.data(); - + auto h_valsA = create_mirror(valuesA); + auto h_valsB = create_mirror(valuesB); + auto h_entriesA = create_mirror(entriesA); + auto h_entriesB = create_mirror(entriesB); + const int *a_adj = h_entriesA.data(); + const int *b_adj = h_entriesB.data(); + const value_type *a_ew = h_valsA.data(); + const value_type *b_ew = h_valsB.data(); + + // Hack: we discard const with pointer casts here to work around MKL + // requiring mutable input and our symbolic interface not providing it using Matrix = MKLSparseMatrix; - Matrix A(m, n, a_xadj, (int *)(entriesA.data()), a_ew); - Matrix B(n, k, b_xadj, (int *)entriesB.data(), b_ew); + Matrix A(m, n, (int *)a_xadj, (int *)a_adj, (value_type *)a_ew); + Matrix B(n, k, (int *)b_xadj, (int *)b_adj, (value_type *)b_ew); sparse_operation_t operation; if (transposeA && transposeB) { @@ -317,10 +333,21 @@ class MKLApply { C.destroy(); } - template - inline static void copy(size_t num_elems, from_type from, to_type to) { - KokkosKernels::Impl::copy_vector(num_elems, - from, to); + template + inline static void copy(from_view_type from, dst_view_type to) { + auto h_from = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), from); + auto h_to = Kokkos::create_mirror_view(Kokkos::HostSpace(), to); + Kokkos::deep_copy(h_to, h_from); // view copy (for different element types) + Kokkos::deep_copy(to, h_to); + Kokkos::fence(); + } + + template + inline static decltype(auto) make_host_view(const T *data, size_t num_elems) { + using device_type = + Kokkos::Device; + return Kokkos::View(data, num_elems); } }; #endif // KOKKOSKERNELS_ENABLE_TPL_MKL diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp index e5ab088bdc..cb3d04b019 100644 --- a/unit_test/sparse/Test_Sparse_spgemm.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm.hpp @@ -315,12 +315,6 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, if (A.values.extent(0) > max_integer) { is_expected_to_fail = true; } - - if (!(Kokkos::SpaceAccessibility< - typename Kokkos::HostSpace::execution_space, - typename device::memory_space>::accessible)) { - is_expected_to_fail = true; - } break; case SPGEMM_KK: algo = "SPGEMM_KK"; break; From 0ba8b395bdb56f027c86f69c4f8e50521aff63f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Wed, 19 Jan 2022 15:56:50 +0100 Subject: [PATCH 029/261] fix -Wunused-parameter errors --- .../impl/KokkosSparse_spgemm_mkl_impl.hpp | 39 ++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 9bc4a9faac..13d0c00e1e 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -172,7 +172,8 @@ class MKLApply { using scalar_t = typename KernelHandle::nnz_scalar_t; const auto export_rowmap = [&](MKL_INT m, MKL_INT *rows_start, - MKL_INT *columns, scalar_t *values) { + MKL_INT * /*columns*/, + scalar_t * /*values*/) { if (handle->mkl_keep_output) { Kokkos::Timer copy_time; const nnz_lno_t nnz = rows_start[m]; @@ -204,7 +205,7 @@ class MKLApply { a_rowmap_view_type row_mapA, a_index_view_type entriesA, a_values_view_type valuesA, bool transposeA, b_rowmap_view_type row_mapB, b_index_view_type entriesB, b_values_view_type valuesB, bool transposeB, - c_rowmap_view_type row_mapC, c_index_view_type entriesC, + c_rowmap_view_type /* row_mapC */, c_index_view_type entriesC, c_values_view_type valuesC, bool verbose = false) { Kokkos::Timer timer; @@ -234,9 +235,9 @@ class MKLApply { private: template - static void apply(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, - a_rowmap_view_type row_mapA, a_index_view_type entriesA, - a_values_view_type valuesA, + static void apply(KernelHandle * /* handle */, nnz_lno_t m, nnz_lno_t n, + nnz_lno_t k, a_rowmap_view_type row_mapA, + a_index_view_type entriesA, a_values_view_type valuesA, bool transposeA, b_rowmap_view_type row_mapB, b_index_view_type entriesB, b_values_view_type valuesB, @@ -362,6 +363,18 @@ void mkl_symbolic(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, c_rowmap_type row_mapC, bool verbose = false) { #ifndef KOKKOSKERNELS_ENABLE_TPL_MKL throw std::runtime_error("MKL was not enabled in this build!"); + (void)handle; + (void)m; + (void)n; + (void)k; + (void)row_mapA; + (void)entriesA; + (void)transposeA; + (void)row_mapB; + (void)entriesB; + (void)transposeB; + (void)row_mapC; + (void)verbose; #else using values_type = typename KernelHandle::scalar_temp_work_view_t; using c_index_type = b_index_type; @@ -386,6 +399,22 @@ void mkl_apply(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, c_values_type valuesC, bool verbose = false) { #ifndef KOKKOSKERNELS_ENABLE_TPL_MKL throw std::runtime_error("MKL was not enabled in this build!"); + (void)handle; + (void)m; + (void)n; + (void)k; + (void)row_mapA; + (void)entriesA; + (void)valuesA; + (void)transposeA; + (void)row_mapB; + (void)entriesB; + (void)valuesB; + (void)transposeB; + (void)row_mapC; + (void)entriesC; + (void)valuesC; + (void)verbose; #else using mkl = MKLApply Date: Wed, 2 Feb 2022 21:51:30 +0100 Subject: [PATCH 030/261] Fix name shadowing --- src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 13d0c00e1e..e6babd1a30 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -171,14 +171,14 @@ class MKLApply { Kokkos::Timer timer; using scalar_t = typename KernelHandle::nnz_scalar_t; - const auto export_rowmap = [&](MKL_INT m, MKL_INT *rows_start, + const auto export_rowmap = [&](MKL_INT num_rows, MKL_INT *rows_start, MKL_INT * /*columns*/, scalar_t * /*values*/) { if (handle->mkl_keep_output) { Kokkos::Timer copy_time; - const nnz_lno_t nnz = rows_start[m]; + const nnz_lno_t nnz = rows_start[num_rows]; handle->set_c_nnz(nnz); - copy(make_host_view(rows_start, m + 1), row_mapC); + copy(make_host_view(rows_start, num_rows + 1), row_mapC); if (verbose) std::cout << "\tMKL rowmap export time:" << copy_time.seconds() << std::endl; @@ -210,11 +210,11 @@ class MKLApply { Kokkos::Timer timer; const auto export_values = - [&](MKL_INT m, MKL_INT *rows_start, MKL_INT *columns, + [&](MKL_INT num_rows, MKL_INT *rows_start, MKL_INT *columns, typename KernelHandle::nnz_scalar_t *values) { if (handle->mkl_keep_output) { Kokkos::Timer copy_time; - const nnz_lno_t nnz = rows_start[m]; + const nnz_lno_t nnz = rows_start[num_rows]; copy(make_host_view(columns, nnz), entriesC); copy(make_host_view(values, nnz), valuesC); if (verbose) From 850db252d3e5be106e3c9acfcae44f978284c87a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Wed, 2 Feb 2022 21:51:58 +0100 Subject: [PATCH 031/261] Remove unnecessary fence --- src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index e6babd1a30..4f73703065 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -341,7 +341,6 @@ class MKLApply { auto h_to = Kokkos::create_mirror_view(Kokkos::HostSpace(), to); Kokkos::deep_copy(h_to, h_from); // view copy (for different element types) Kokkos::deep_copy(to, h_to); - Kokkos::fence(); } template From 62f0549de7aab3e7e7d1924c2dbfe276c24373a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Wed, 2 Feb 2022 21:52:30 +0100 Subject: [PATCH 032/261] Clean up make_host_view() --- src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 4f73703065..9770465eb3 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -343,11 +343,10 @@ class MKLApply { Kokkos::deep_copy(to, h_to); } - template - inline static decltype(auto) make_host_view(const T *data, size_t num_elems) { - using device_type = - Kokkos::Device; - return Kokkos::View(data, num_elems); + template > + inline static view_type make_host_view(const T *data, size_t num_elems) { + return view_type(data, num_elems); } }; #endif // KOKKOSKERNELS_ENABLE_TPL_MKL From 146fcfe649228fdad5950a573bf1002e6bfaf6d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Wed, 2 Feb 2022 22:01:18 +0100 Subject: [PATCH 033/261] Rename get() to export_data() --- .../impl/KokkosSparse_spgemm_mkl_impl.hpp | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 9770465eb3..d0b36c2a50 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -77,8 +77,9 @@ class MKLSparseMatrix { return MKLSparseMatrix(c); } - inline void get(MKL_INT &rows, MKL_INT &cols, MKL_INT *&rows_start, - MKL_INT *&columns, value_type *&values); + inline void export_data(MKL_INT &num_rows, MKL_INT &num_cols, + MKL_INT *&rows_start, MKL_INT *&columns, + value_type *&values); inline void destroy() { mkl_call(mkl_sparse_destroy(mtx), "mkl_sparse_destroy() failed!"); @@ -109,13 +110,15 @@ inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, } template <> -inline void MKLSparseMatrix::get(MKL_INT &rows, MKL_INT &cols, - MKL_INT *&rows_start, MKL_INT *&columns, - float *&values) { +inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, + MKL_INT &num_cols, + MKL_INT *&rows_start, + MKL_INT *&columns, + float *&values) { sparse_index_base_t indexing; MKL_INT *rows_end; - mkl_call(mkl_sparse_s_export_csr(mtx, &indexing, &rows, &cols, &rows_start, - &rows_end, &columns, &values), + mkl_call(mkl_sparse_s_export_csr(mtx, &indexing, &num_rows, &num_cols, + &rows_start, &rows_end, &columns, &values), "Failed to export matrix with mkl_sparse_s_export_csr()!"); if (SPARSE_INDEX_BASE_ZERO != indexing) { throw std::runtime_error( @@ -125,13 +128,15 @@ inline void MKLSparseMatrix::get(MKL_INT &rows, MKL_INT &cols, } template <> -inline void MKLSparseMatrix::get(MKL_INT &rows, MKL_INT &cols, - MKL_INT *&rows_start, - MKL_INT *&columns, double *&values) { +inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, + MKL_INT &num_cols, + MKL_INT *&rows_start, + MKL_INT *&columns, + double *&values) { sparse_index_base_t indexing; MKL_INT *rows_end; - mkl_call(mkl_sparse_d_export_csr(mtx, &indexing, &rows, &cols, &rows_start, - &rows_end, &columns, &values), + mkl_call(mkl_sparse_d_export_csr(mtx, &indexing, &num_rows, &num_cols, + &rows_start, &rows_end, &columns, &values), "Failed to export matrix with mkl_sparse_s_export_csr()!"); if (SPARSE_INDEX_BASE_ZERO != indexing) { throw std::runtime_error( @@ -324,9 +329,9 @@ class MKLApply { std::cout << ") time:" << timer1.seconds() << std::endl; } - MKL_INT c_rows, c_cols, *rows_start, *columns; + MKL_INT num_rows, num_cols, *rows_start, *columns; value_type *values; - C.get(c_rows, c_cols, rows_start, columns, values); + C.export_data(num_rows, num_cols, rows_start, columns, values); callback(m, rows_start, columns, values); A.destroy(); From 102eb6f44865510fbd3d831fd4316c68538e4a55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Thu, 17 Feb 2022 13:25:24 +0100 Subject: [PATCH 034/261] Fix -Wunused-parameter errors --- .../impl/KokkosSparse_spgemm_mkl2phase_impl.hpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp index 5715c7f098..90c35dbaf8 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp @@ -302,6 +302,11 @@ void mkl2phase_symbolic( (void)transposeA; (void)transposeB; (void)verbose; + (void)a_xadj; + (void)b_xadj; + (void)c_xadj; + (void)a_adj; + (void)b_adj; #endif } else { @@ -351,9 +356,7 @@ void mkl2phase_apply( typename KernelHandle::HandlePersistentMemorySpace; using int_persistent_work_view_t = typename Kokkos::View; - using MyExecSpace = typename KernelHandle::HandleExecSpace; - using value_type = typename KernelHandle::nnz_scalar_t; - using idx = typename KernelHandle::nnz_lno_t; + using idx = typename KernelHandle::nnz_lno_t; if (std::is_same::value) { int *a_xadj = (int *)row_mapA.data(); @@ -639,6 +642,11 @@ void mkl2phase_apply( (void)transposeA; (void)transposeB; (void)verbose; + (void)a_xadj; + (void)b_xadj; + (void)c_xadj; + (void)a_adj; + (void)b_adj; #endif // __INTEL_MKL__ == 2018 && __INTEL_MKL_UPDATE__ >= 2 } else { (void)m; From 67a603d0b5808e63070b3568bb7ee67bbf85b06a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Thu, 17 Feb 2022 13:48:53 +0100 Subject: [PATCH 035/261] Gather MKL utilities within dedicated header --- perf_test/sparse/KokkosSparse_spadd.cpp | 30 ++----- src/common/KokkosKernels_SparseUtils_mkl.hpp | 87 +++++++++++++++++++ ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 54 ++++-------- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 38 ++------ .../impl/KokkosSparse_spgemm_mkl_impl.hpp | 38 ++++---- 5 files changed, 137 insertions(+), 110 deletions(-) create mode 100644 src/common/KokkosKernels_SparseUtils_mkl.hpp diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 7b0bd42d2a..49034930e6 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -47,6 +47,7 @@ #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_IOUtils.hpp" #include "KokkosKernels_SparseUtils_cusparse.hpp" +#include "KokkosKernels_SparseUtils_mkl.hpp" #include "KokkosSparse_spadd.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -57,21 +58,6 @@ #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include #include - -inline void spadd_mkl_internal_safe_call(sparse_status_t mklStatus, - const char* name, - const char* file = nullptr, - const int line = 0) { - if (SPARSE_STATUS_SUCCESS != mklStatus) { - std::ostringstream oss; - oss << "MKL call \"" << name << "\" encountered error at " << file << ":" - << line << '\n'; - Kokkos::abort(oss.str().c_str()); - } -} - -#define SPADD_MKL_SAFE_CALL(call) \ - spadd_mkl_internal_safe_call(call, #call, __FILE__, __LINE__) #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) && \ @@ -259,11 +245,11 @@ void run_experiment(const Params& params) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL sparse_matrix_t Amkl, Bmkl, Cmkl; if (params.use_mkl) { - SPADD_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + MKL_SAFE_CALL(mkl_sparse_d_create_csr( &Amkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)A.graph.row_map.data(), (int*)A.graph.row_map.data() + 1, A.graph.entries.data(), A.values.data())); - SPADD_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + MKL_SAFE_CALL(mkl_sparse_d_create_csr( &Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)B.graph.row_map.data(), (int*)B.graph.row_map.data() + 1, B.graph.entries.data(), B.values.data())); @@ -326,9 +312,9 @@ void run_experiment(const Params& params) { #endif } else if (params.use_mkl) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - SPADD_MKL_SAFE_CALL(mkl_sparse_d_add(SPARSE_OPERATION_NON_TRANSPOSE, - Amkl, 1.0, Bmkl, &Cmkl)); - SPADD_MKL_SAFE_CALL(mkl_sparse_destroy(Cmkl)); + MKL_SAFE_CALL(mkl_sparse_d_add(SPARSE_OPERATION_NON_TRANSPOSE, Amkl, + 1.0, Bmkl, &Cmkl)); + MKL_SAFE_CALL(mkl_sparse_destroy(Cmkl)); #endif } else { spadd_numeric( @@ -351,8 +337,8 @@ void run_experiment(const Params& params) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL if (params.use_mkl) { - SPADD_MKL_SAFE_CALL(mkl_sparse_destroy(Amkl)); - SPADD_MKL_SAFE_CALL(mkl_sparse_destroy(Bmkl)); + MKL_SAFE_CALL(mkl_sparse_destroy(Amkl)); + MKL_SAFE_CALL(mkl_sparse_destroy(Bmkl)); } #endif diff --git a/src/common/KokkosKernels_SparseUtils_mkl.hpp b/src/common/KokkosKernels_SparseUtils_mkl.hpp new file mode 100644 index 0000000000..7085851092 --- /dev/null +++ b/src/common/KokkosKernels_SparseUtils_mkl.hpp @@ -0,0 +1,87 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP +#define _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP + +#include "KokkosKernels_config.h" + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + +#include + +namespace KokkosSparse { +namespace Impl { + +inline void mkl_internal_safe_call(sparse_status_t mkl_status, const char *name, + const char *file = nullptr, + const int line = 0) { + if (SPARSE_STATUS_SUCCESS != mkl_status) { + std::ostringstream oss; + oss << "MKL call \"" << name << "\" encountered error at " << file << ":" + << line << '\n'; + Kokkos::abort(oss.str().c_str()); + } +} + +#define MKL_SAFE_CALL(call) \ + KokkosSparse::Impl::mkl_internal_safe_call(call, #call, __FILE__, __LINE__) + +inline sparse_operation_t mode_kk_to_mkl(char mode_kk) { + switch (toupper(mode_kk)) { + case 'N': return SPARSE_OPERATION_NON_TRANSPOSE; + case 'T': return SPARSE_OPERATION_TRANSPOSE; + case 'H': return SPARSE_OPERATION_CONJUGATE_TRANSPOSE; + default:; + } + throw std::invalid_argument( + "Invalid mode for MKL (should be one of N, T, H)"); +} + +} // namespace Impl +} // namespace KokkosSparse + +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL + +#endif // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP \ No newline at end of file diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index a6eec44449..d3c15e0267 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -46,6 +46,7 @@ #define KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP #include "KokkosKernels_Controls.hpp" +#include "KokkosKernels_SparseUtils_mkl.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include @@ -57,26 +58,7 @@ namespace Impl { #if (__INTEL_MKL__ > 2017) // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() -namespace BSR { -inline void mkl_safe_call(int errcode) { - if (errcode != SPARSE_STATUS_SUCCESS) - throw std::runtime_error("MKL returned non-success error code"); -} - -inline sparse_operation_t mode_kk_to_mkl(char mode_kk) { - switch (toupper(mode_kk)) { - case 'N': return SPARSE_OPERATION_NON_TRANSPOSE; - case 'T': return SPARSE_OPERATION_TRANSPOSE; - case 'H': return SPARSE_OPERATION_CONJUGATE_TRANSPOSE; - default:; - } - throw std::invalid_argument( - "Invalid mode for MKL (should be one of N, T, H)"); -} -} // namespace BSR - -using BSR::mkl_safe_call; -using BSR::mode_kk_to_mkl; +using KokkosSparse::Impl::mode_kk_to_mkl; inline matrix_descr getDescription() { matrix_descr A_descr; @@ -91,13 +73,13 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, const int* Aentries, const float* Avalues, const float* x, float* y) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_s_create_bsr( + MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); - mkl_safe_call(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + MKL_SAFE_CALL(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, @@ -106,13 +88,13 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, const double* Avalues, const double* x, double* y) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_d_create_bsr( + MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); - mkl_safe_call(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + MKL_SAFE_CALL(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_block_impl_mkl(sparse_operation_t op, @@ -123,7 +105,7 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, const Kokkos::complex* x, Kokkos::complex* y) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_c_create_bsr( + MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); @@ -131,7 +113,7 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); MKL_Complex8& beta_mkl = reinterpret_cast(beta); matrix_descr A_descr = getDescription(); - mkl_safe_call(mkl_sparse_c_mv(op, alpha_mkl, A_mkl, A_descr, + MKL_SAFE_CALL(mkl_sparse_c_mv(op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); } @@ -144,7 +126,7 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, const Kokkos::complex* x, Kokkos::complex* y) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_z_create_bsr( + MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); @@ -152,7 +134,7 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, matrix_descr A_descr = getDescription(); MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); MKL_Complex16& beta_mkl = reinterpret_cast(beta); - mkl_safe_call(mkl_sparse_z_mv(op, alpha_mkl, A_mkl, A_descr, + MKL_SAFE_CALL(mkl_sparse_z_mv(op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); } @@ -163,13 +145,13 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, const float* Avalues, const float* x, int colx, int ldx, float* y, int ldy) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_s_create_bsr( + MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); - mkl_safe_call(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr, + MKL_SAFE_CALL(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, x, colx, ldx, beta, y, ldy)); } @@ -180,13 +162,13 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, const double* Avalues, const double* x, int colx, int ldx, double* y, int ldy) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_d_create_bsr( + MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); - mkl_safe_call(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr, + MKL_SAFE_CALL(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, x, colx, ldx, beta, y, ldy)); } @@ -200,7 +182,7 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, const Kokkos::complex* x, int colx, int ldx, Kokkos::complex* y, int ldy) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_c_create_bsr( + MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); @@ -208,7 +190,7 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); MKL_Complex8& beta_mkl = reinterpret_cast(beta); matrix_descr A_descr = getDescription(); - mkl_safe_call( + MKL_SAFE_CALL( mkl_sparse_c_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, reinterpret_cast(x), colx, ldx, beta_mkl, reinterpret_cast(y), ldy)); @@ -221,7 +203,7 @@ inline void spm_mv_block_impl_mkl( const Kokkos::complex* x, int colx, int ldx, Kokkos::complex* y, int ldy) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_z_create_bsr( + MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); @@ -229,7 +211,7 @@ inline void spm_mv_block_impl_mkl( matrix_descr A_descr = getDescription(); MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); MKL_Complex16& beta_mkl = reinterpret_cast(beta); - mkl_safe_call( + MKL_SAFE_CALL( mkl_sparse_z_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, reinterpret_cast(x), colx, ldx, beta_mkl, reinterpret_cast(y), ldy)); diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 17a72b2ad3..bacc749840 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -530,6 +530,7 @@ KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutRight, #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include +#include "KokkosKernels_SparseUtils_mkl.hpp" namespace KokkosSparse { namespace Impl { @@ -537,27 +538,6 @@ namespace Impl { #if (__INTEL_MKL__ > 2017) // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() -// Note 12/03/21 - lbv: -// mkl_safe_call and mode_kk_to_mkl should -// be moved to some sparse or mkl utility -// header. It is likely that these will be -// reused for other kernels. -inline void mkl_safe_call(int errcode) { - if (errcode != SPARSE_STATUS_SUCCESS) - throw std::runtime_error("MKL returned non-success error code"); -} - -inline sparse_operation_t mode_kk_to_mkl(char mode_kk) { - switch (toupper(mode_kk)) { - case 'N': return SPARSE_OPERATION_NON_TRANSPOSE; - case 'T': return SPARSE_OPERATION_TRANSPOSE; - case 'H': return SPARSE_OPERATION_CONJUGATE_TRANSPOSE; - default:; - } - throw std::invalid_argument( - "Invalid mode for MKL (should be one of N, T, H)"); -} - inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, int m, int n, const int* Arowptrs, const int* Aentries, const float* Avalues, const float* x, float* y) { @@ -566,11 +546,11 @@ inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, int m, A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; - mkl_safe_call(mkl_sparse_s_create_csr( + MKL_SAFE_CALL(mkl_sparse_s_create_csr( &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); - mkl_safe_call(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + MKL_SAFE_CALL(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, int m, @@ -581,11 +561,11 @@ inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, int m, A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; - mkl_safe_call(mkl_sparse_d_create_csr( + MKL_SAFE_CALL(mkl_sparse_d_create_csr( &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); - mkl_safe_call(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + MKL_SAFE_CALL(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, @@ -599,13 +579,13 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; - mkl_safe_call(mkl_sparse_c_create_csr( + MKL_SAFE_CALL(mkl_sparse_c_create_csr( &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); MKL_Complex8& beta_mkl = reinterpret_cast(beta); - mkl_safe_call(mkl_sparse_c_mv(op, alpha_mkl, A_mkl, A_descr, + MKL_SAFE_CALL(mkl_sparse_c_mv(op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); } @@ -621,13 +601,13 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; - mkl_safe_call(mkl_sparse_z_create_csr( + MKL_SAFE_CALL(mkl_sparse_z_create_csr( &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); MKL_Complex16& beta_mkl = reinterpret_cast(beta); - mkl_safe_call(mkl_sparse_z_mv(op, alpha_mkl, A_mkl, A_descr, + MKL_SAFE_CALL(mkl_sparse_z_mv(op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); } diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index d0b36c2a50..50bf840e58 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -45,6 +45,9 @@ #ifndef _KOKKOSSPGEMMMKL_HPP #define _KOKKOSSPGEMMMKL_HPP +#include "KokkosKernels_config.h" +#include "KokkosKernels_SparseUtils_mkl.hpp" + #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include "mkl_spblas.h" #endif @@ -54,12 +57,6 @@ namespace Impl { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL -inline void mkl_call(sparse_status_t result, const char *err_msg) { - if (SPARSE_STATUS_SUCCESS != result) { - throw std::runtime_error(err_msg); - } -} - template class MKLSparseMatrix { sparse_matrix_t mtx; @@ -72,8 +69,7 @@ class MKLSparseMatrix { sparse_operation_t operation, const MKLSparseMatrix &A, const MKLSparseMatrix &B) { sparse_matrix_t c; - mkl_call(mkl_sparse_spmm(operation, A.mtx, B.mtx, &c), - "mkl_sparse_spmm() failed!"); + MKL_SAFE_CALL(mkl_sparse_spmm(operation, A.mtx, B.mtx, &c)); return MKLSparseMatrix(c); } @@ -81,9 +77,7 @@ class MKLSparseMatrix { MKL_INT *&rows_start, MKL_INT *&columns, value_type *&values); - inline void destroy() { - mkl_call(mkl_sparse_destroy(mtx), "mkl_sparse_destroy() failed!"); - } + inline void destroy() { MKL_SAFE_CALL(mkl_sparse_destroy(mtx)); } private: inline MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {} @@ -94,9 +88,8 @@ inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, float *values) { - mkl_call(mkl_sparse_s_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, - xadj, xadj + 1, adj, values), - "mkl_sparse_s_create_csr() failed!"); + MKL_SAFE_CALL(mkl_sparse_s_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, + cols, xadj, xadj + 1, adj, values)); } template <> @@ -104,9 +97,8 @@ inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, double *values) { - mkl_call(mkl_sparse_d_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, - xadj, xadj + 1, adj, values), - "mkl_sparse_d_create_csr() failed!"); + MKL_SAFE_CALL(mkl_sparse_d_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, + cols, xadj, xadj + 1, adj, values)); } template <> @@ -117,9 +109,9 @@ inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, float *&values) { sparse_index_base_t indexing; MKL_INT *rows_end; - mkl_call(mkl_sparse_s_export_csr(mtx, &indexing, &num_rows, &num_cols, - &rows_start, &rows_end, &columns, &values), - "Failed to export matrix with mkl_sparse_s_export_csr()!"); + MKL_SAFE_CALL(mkl_sparse_s_export_csr(mtx, &indexing, &num_rows, &num_cols, + &rows_start, &rows_end, &columns, + &values)); if (SPARSE_INDEX_BASE_ZERO != indexing) { throw std::runtime_error( "Expected zero based indexing in exported MKL sparse matrix\n"); @@ -135,9 +127,9 @@ inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, double *&values) { sparse_index_base_t indexing; MKL_INT *rows_end; - mkl_call(mkl_sparse_d_export_csr(mtx, &indexing, &num_rows, &num_cols, - &rows_start, &rows_end, &columns, &values), - "Failed to export matrix with mkl_sparse_s_export_csr()!"); + MKL_SAFE_CALL(mkl_sparse_d_export_csr(mtx, &indexing, &num_rows, &num_cols, + &rows_start, &rows_end, &columns, + &values)); if (SPARSE_INDEX_BASE_ZERO != indexing) { throw std::runtime_error( "Expected zero based indexing in exported MKL sparse matrix\n"); From 05293435613e65e0a865e595b8b5c373424368eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Thu, 17 Feb 2022 14:51:27 +0100 Subject: [PATCH 036/261] Move MKLSparseMatrix to MKL utils header --- src/common/KokkosKernels_SparseUtils_mkl.hpp | 79 +++++++++++++++++ .../impl/KokkosSparse_spgemm_mkl_impl.hpp | 86 ++----------------- 2 files changed, 87 insertions(+), 78 deletions(-) diff --git a/src/common/KokkosKernels_SparseUtils_mkl.hpp b/src/common/KokkosKernels_SparseUtils_mkl.hpp index 7085851092..a2ab16fba9 100644 --- a/src/common/KokkosKernels_SparseUtils_mkl.hpp +++ b/src/common/KokkosKernels_SparseUtils_mkl.hpp @@ -79,6 +79,85 @@ inline sparse_operation_t mode_kk_to_mkl(char mode_kk) { "Invalid mode for MKL (should be one of N, T, H)"); } +// MKLSparseMatrix provides thin wrapper around MKL matrix handle +// (sparse_matrix_t) and encapsulates MKL call dispatches related to details +// like value_type, allowing simple client code in kernels. +template +class MKLSparseMatrix { + sparse_matrix_t mtx; + + public: + inline MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {} + + // Constructs MKL sparse matrix from KK sparse views (m rows x n cols) + inline MKLSparseMatrix(const MKL_INT num_rows, const MKL_INT num_cols, + MKL_INT *xadj, MKL_INT *adj, value_type *values); + + // Allows using MKLSparseMatrix directly in MKL calls + inline operator sparse_matrix_t() const { return mtx; } + + // Exports MKL sparse matrix contents into KK views + inline void export_data(MKL_INT &num_rows, MKL_INT &num_cols, + MKL_INT *&rows_start, MKL_INT *&columns, + value_type *&values); + + inline void destroy() { MKL_SAFE_CALL(mkl_sparse_destroy(mtx)); } +}; + +template <> +inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, + const MKL_INT cols, + MKL_INT *xadj, MKL_INT *adj, + float *values) { + MKL_SAFE_CALL(mkl_sparse_s_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, + cols, xadj, xadj + 1, adj, values)); +} + +template <> +inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, + const MKL_INT cols, + MKL_INT *xadj, MKL_INT *adj, + double *values) { + MKL_SAFE_CALL(mkl_sparse_d_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, + cols, xadj, xadj + 1, adj, values)); +} + +template <> +inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, + MKL_INT &num_cols, + MKL_INT *&rows_start, + MKL_INT *&columns, + float *&values) { + sparse_index_base_t indexing; + MKL_INT *rows_end; + MKL_SAFE_CALL(mkl_sparse_s_export_csr(mtx, &indexing, &num_rows, &num_cols, + &rows_start, &rows_end, &columns, + &values)); + if (SPARSE_INDEX_BASE_ZERO != indexing) { + throw std::runtime_error( + "Expected zero based indexing in exported MKL sparse matrix\n"); + return; + } +} + +template <> +inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, + MKL_INT &num_cols, + MKL_INT *&rows_start, + MKL_INT *&columns, + double *&values) { + sparse_index_base_t indexing; + MKL_INT *rows_end; + MKL_SAFE_CALL(mkl_sparse_d_export_csr(mtx, &indexing, &num_rows, &num_cols, + &rows_start, &rows_end, &columns, + &values)); + if (SPARSE_INDEX_BASE_ZERO != indexing) { + throw std::runtime_error( + "Expected zero based indexing in exported MKL sparse matrix\n"); + return; + } +} + } // namespace Impl } // namespace KokkosSparse diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 50bf840e58..3044b2c576 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -57,84 +57,14 @@ namespace Impl { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +// multiplies two sparse MKL matrices and returns sparse MKL matrix template -class MKLSparseMatrix { - sparse_matrix_t mtx; - - public: - inline MKLSparseMatrix(const MKL_INT m, const MKL_INT n, MKL_INT *xadj, - MKL_INT *adj, value_type *values); - - inline static MKLSparseMatrix spmm( - sparse_operation_t operation, const MKLSparseMatrix &A, - const MKLSparseMatrix &B) { - sparse_matrix_t c; - MKL_SAFE_CALL(mkl_sparse_spmm(operation, A.mtx, B.mtx, &c)); - return MKLSparseMatrix(c); - } - - inline void export_data(MKL_INT &num_rows, MKL_INT &num_cols, - MKL_INT *&rows_start, MKL_INT *&columns, - value_type *&values); - - inline void destroy() { MKL_SAFE_CALL(mkl_sparse_destroy(mtx)); } - - private: - inline MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {} -}; - -template <> -inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, - const MKL_INT cols, - MKL_INT *xadj, MKL_INT *adj, - float *values) { - MKL_SAFE_CALL(mkl_sparse_s_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, - cols, xadj, xadj + 1, adj, values)); -} - -template <> -inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, - const MKL_INT cols, - MKL_INT *xadj, MKL_INT *adj, - double *values) { - MKL_SAFE_CALL(mkl_sparse_d_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, - cols, xadj, xadj + 1, adj, values)); -} - -template <> -inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, - MKL_INT &num_cols, - MKL_INT *&rows_start, - MKL_INT *&columns, - float *&values) { - sparse_index_base_t indexing; - MKL_INT *rows_end; - MKL_SAFE_CALL(mkl_sparse_s_export_csr(mtx, &indexing, &num_rows, &num_cols, - &rows_start, &rows_end, &columns, - &values)); - if (SPARSE_INDEX_BASE_ZERO != indexing) { - throw std::runtime_error( - "Expected zero based indexing in exported MKL sparse matrix\n"); - return; - } -} - -template <> -inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, - MKL_INT &num_cols, - MKL_INT *&rows_start, - MKL_INT *&columns, - double *&values) { - sparse_index_base_t indexing; - MKL_INT *rows_end; - MKL_SAFE_CALL(mkl_sparse_d_export_csr(mtx, &indexing, &num_rows, &num_cols, - &rows_start, &rows_end, &columns, - &values)); - if (SPARSE_INDEX_BASE_ZERO != indexing) { - throw std::runtime_error( - "Expected zero based indexing in exported MKL sparse matrix\n"); - return; - } +inline static MKLSparseMatrix mkl_spmm( + sparse_operation_t operation, const MKLSparseMatrix &A, + const MKLSparseMatrix &B) { + sparse_matrix_t C; + MKL_SAFE_CALL(mkl_sparse_spmm(operation, A, B, &C)); + return MKLSparseMatrix(C); } template ::value) From 3339c8deae2f350c4a71ef831508d93e72cbf23c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Thu, 17 Feb 2022 14:56:49 +0100 Subject: [PATCH 037/261] Rename "apply" into "spmm" --- .../impl/KokkosSparse_spgemm_mkl_impl.hpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 3044b2c576..43b2b5081b 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -72,7 +72,7 @@ template -class MKLApply { +class MKL_SPMM { public: typedef typename KernelHandle::nnz_lno_t nnz_lno_t; typedef typename KernelHandle::size_type size_type; @@ -120,8 +120,8 @@ class MKLApply { Kokkos::ViewAllocateWithoutInitializing("tmp_valuesB"), entriesB.extent(0)); - apply(handle, m, n, k, row_mapA, entriesA, tmp_valsA, transposeA, row_mapB, - entriesB, tmp_valsB, transposeB, verbose, export_rowmap); + spmm(handle, m, n, k, row_mapA, entriesA, tmp_valsA, transposeA, row_mapB, + entriesB, tmp_valsB, transposeB, verbose, export_rowmap); if (verbose) std::cout << "MKL symbolic time:" << timer.seconds() << std::endl; @@ -150,8 +150,8 @@ class MKLApply { } }; - apply(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB, - entriesB, valuesB, transposeB, verbose, export_values); + spmm(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB, + entriesB, valuesB, transposeB, verbose, export_values); if (verbose) std::cout << "MKL numeric time:" << timer.seconds() << std::endl; @@ -162,13 +162,13 @@ class MKLApply { private: template - static void apply(KernelHandle * /* handle */, nnz_lno_t m, nnz_lno_t n, - nnz_lno_t k, a_rowmap_view_type row_mapA, - a_index_view_type entriesA, a_values_view_type valuesA, + static void spmm(KernelHandle * /* handle */, nnz_lno_t m, nnz_lno_t n, + nnz_lno_t k, a_rowmap_view_type row_mapA, + a_index_view_type entriesA, a_values_view_type valuesA, - bool transposeA, b_rowmap_view_type row_mapB, - b_index_view_type entriesB, b_values_view_type valuesB, - bool transposeB, bool verbose, const CB &callback) { + bool transposeA, b_rowmap_view_type row_mapB, + b_index_view_type entriesB, b_values_view_type valuesB, + bool transposeB, bool verbose, const CB &callback) { if (!std::is_same::value) { throw std::runtime_error("MKL requires local ordinals to be integer.\n"); } @@ -303,7 +303,7 @@ void mkl_symbolic(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, #else using values_type = typename KernelHandle::scalar_temp_work_view_t; using c_index_type = b_index_type; - using mkl = MKLApply; mkl::mkl_symbolic(handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB, @@ -341,7 +341,7 @@ void mkl_apply(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, (void)valuesC; (void)verbose; #else - using mkl = MKLApply; mkl::mkl_numeric(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA, From 8c8cbdf8b7cf6e508b7cd5f3587ff61f01e847de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Thu, 17 Feb 2022 14:59:14 +0100 Subject: [PATCH 038/261] Guard whole file with ENABLE_TPL_MKL --- .../impl/KokkosSparse_spgemm_mkl_impl.hpp | 43 +------------------ .../impl/KokkosSparse_spgemm_numeric_spec.hpp | 4 ++ .../KokkosSparse_spgemm_symbolic_spec.hpp | 4 ++ 3 files changed, 10 insertions(+), 41 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 43b2b5081b..6c95e648e9 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -50,13 +50,10 @@ #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include "mkl_spblas.h" -#endif namespace KokkosSparse { namespace Impl { -#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - // multiplies two sparse MKL matrices and returns sparse MKL matrix template inline static MKLSparseMatrix mkl_spmm( @@ -276,7 +273,6 @@ class MKL_SPMM { return view_type(data, num_elems); } }; -#endif // KOKKOSKERNELS_ENABLE_TPL_MKL template ; mkl::mkl_symbolic(handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB, entriesB, transposeB, row_mapC, verbose); -#endif } template ; mkl::mkl_numeric(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC, valuesC, verbose); -#endif } } // namespace Impl } // namespace KokkosSparse -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL +#endif // _KOKKOSSPGEMMMKL_HPP diff --git a/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp index beb969fc77..68e5e82bdb 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp @@ -245,9 +245,13 @@ struct SPGEMM_NUMERIC< transposeB, row_mapC, entriesC, valuesC); break; case SPGEMM_MKL: +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL mkl_apply(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC, valuesC, handle->get_verbose()); +#else + throw std::runtime_error("MKL was not enabled in this build!"); +#endif break; case SPGEMM_MKL2PHASE: mkl2phase_apply(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, diff --git a/src/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp b/src/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp index 181984ebe9..d83ae6767c 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp @@ -179,9 +179,13 @@ struct SPGEMM_SYMBOLICget_verbose()); break; +#else + throw std::runtime_error("MKL was not enabled in this build!"); +#endif } sh->set_call_symbolic(); } From 70bb051a5a42e3bf5395c60363bfba2cddc2f64f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Fri, 18 Feb 2022 13:39:17 +0100 Subject: [PATCH 039/261] Add explicit compilation error about scalar types not supported by MKL --- src/common/KokkosKernels_SparseUtils_mkl.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/common/KokkosKernels_SparseUtils_mkl.hpp b/src/common/KokkosKernels_SparseUtils_mkl.hpp index a2ab16fba9..780c75ea51 100644 --- a/src/common/KokkosKernels_SparseUtils_mkl.hpp +++ b/src/common/KokkosKernels_SparseUtils_mkl.hpp @@ -79,6 +79,14 @@ inline sparse_operation_t mode_kk_to_mkl(char mode_kk) { "Invalid mode for MKL (should be one of N, T, H)"); } +template +struct mkl_is_supported_value_type : std::false_type {}; + +template <> +struct mkl_is_supported_value_type : std::true_type {}; +template <> +struct mkl_is_supported_value_type : std::true_type {}; + // MKLSparseMatrix provides thin wrapper around MKL matrix handle // (sparse_matrix_t) and encapsulates MKL call dispatches related to details // like value_type, allowing simple client code in kernels. @@ -86,6 +94,10 @@ template class MKLSparseMatrix { sparse_matrix_t mtx; + static_assert(mkl_is_supported_value_type::value, + "Scalar type used in MKLSparseMatrix is NOT " + "supported by MKL"); + public: inline MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {} From 650cd176926ab306b586d5169114a398be65e1d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Fri, 18 Feb 2022 13:53:50 +0100 Subject: [PATCH 040/261] Add Kokkos::complex support to MKL sparse matrix --- src/common/KokkosKernels_SparseUtils_mkl.hpp | 54 ++++++++++++++++++++ unit_test/sparse/Test_Sparse_spgemm.hpp | 9 ++-- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/src/common/KokkosKernels_SparseUtils_mkl.hpp b/src/common/KokkosKernels_SparseUtils_mkl.hpp index 780c75ea51..3bd1deb96a 100644 --- a/src/common/KokkosKernels_SparseUtils_mkl.hpp +++ b/src/common/KokkosKernels_SparseUtils_mkl.hpp @@ -86,6 +86,10 @@ template <> struct mkl_is_supported_value_type : std::true_type {}; template <> struct mkl_is_supported_value_type : std::true_type {}; +template <> +struct mkl_is_supported_value_type> : std::true_type {}; +template <> +struct mkl_is_supported_value_type> : std::true_type {}; // MKLSparseMatrix provides thin wrapper around MKL matrix handle // (sparse_matrix_t) and encapsulates MKL call dispatches related to details @@ -134,6 +138,24 @@ inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, cols, xadj, xadj + 1, adj, values)); } +template <> +inline MKLSparseMatrix>::MKLSparseMatrix( + const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, + Kokkos::complex *values) { + MKL_SAFE_CALL(mkl_sparse_c_create_csr( + &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj, + reinterpret_cast(values))); +} + +template <> +inline MKLSparseMatrix>::MKLSparseMatrix( + const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, + Kokkos::complex *values) { + MKL_SAFE_CALL(mkl_sparse_z_create_csr( + &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj, + reinterpret_cast(values))); +} + template <> inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, MKL_INT &num_cols, @@ -170,6 +192,38 @@ inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, } } +template <> +inline void MKLSparseMatrix>::export_data( + MKL_INT &num_rows, MKL_INT &num_cols, MKL_INT *&rows_start, + MKL_INT *&columns, Kokkos::complex *&values) { + sparse_index_base_t indexing; + MKL_INT *rows_end; + MKL_SAFE_CALL(mkl_sparse_c_export_csr( + mtx, &indexing, &num_rows, &num_cols, &rows_start, &rows_end, &columns, + reinterpret_cast(&values))); + if (SPARSE_INDEX_BASE_ZERO != indexing) { + throw std::runtime_error( + "Expected zero based indexing in exported MKL sparse matrix\n"); + return; + } +} + +template <> +inline void MKLSparseMatrix>::export_data( + MKL_INT &num_rows, MKL_INT &num_cols, MKL_INT *&rows_start, + MKL_INT *&columns, Kokkos::complex *&values) { + sparse_index_base_t indexing; + MKL_INT *rows_end; + MKL_SAFE_CALL(mkl_sparse_z_export_csr( + mtx, &indexing, &num_rows, &num_cols, &rows_start, &rows_end, &columns, + reinterpret_cast(&values))); + if (SPARSE_INDEX_BASE_ZERO != indexing) { + throw std::runtime_error( + "Expected zero based indexing in exported MKL sparse matrix\n"); + return; + } +} + } // namespace Impl } // namespace KokkosSparse diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp index cb3d04b019..53158f85ed 100644 --- a/unit_test/sparse/Test_Sparse_spgemm.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm.hpp @@ -299,13 +299,12 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, #endif break; - case SPGEMM_MKL: - algo = "SPGEMM_MKL"; - // MKL requires scalar to be either float or double - if (!(std::is_same::value || - std::is_same::value)) { + case SPGEMM_MKL: algo = "SPGEMM_MKL"; +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if (!KokkosSparse::Impl::mkl_is_supported_value_type::value) { is_expected_to_fail = true; } +#endif // mkl requires local ordinals to be int. if (!(std::is_same::value)) { is_expected_to_fail = true; From 35a4621faf80cf5534cd66a96ed505860fa44d5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Fri, 18 Feb 2022 16:04:07 +0100 Subject: [PATCH 041/261] Adjust unit test tolerance for MKL float --- unit_test/sparse/Test_Sparse_spgemm.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp index 53158f85ed..ab84b7b0a5 100644 --- a/unit_test/sparse/Test_Sparse_spgemm.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm.hpp @@ -229,7 +229,7 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { typedef typename Kokkos::Details::ArithTraits< typename scalar_view_t::non_const_value_type>::mag_type eps_type; - eps_type eps = std::is_same::value ? 2 * 1e-3 : 1e-7; + eps_type eps = std::is_same::value ? 3.7e-3 : 1e-7; is_identical = KokkosKernels::Impl::kk_is_relatively_identical_view< scalar_view_t, scalar_view_t, eps_type, typename device::execution_space>( From a972c7523998cf1d59d204361a8ea1bbfd7713d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Fri, 18 Feb 2022 16:06:09 +0100 Subject: [PATCH 042/261] Fix conversion compiler errors --- src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 6c95e648e9..36784731d0 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -214,8 +214,8 @@ class MKL_SPMM { auto h_valsB = create_mirror(valuesB); auto h_entriesA = create_mirror(entriesA); auto h_entriesB = create_mirror(entriesB); - const int *a_adj = h_entriesA.data(); - const int *b_adj = h_entriesB.data(); + const int *a_adj = reinterpret_cast(h_entriesA.data()); + const int *b_adj = reinterpret_cast(h_entriesB.data()); const value_type *a_ew = h_valsA.data(); const value_type *b_ew = h_valsB.data(); From 9d4de666b81b6721142397f7b27ca9aead795dd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Fri, 18 Feb 2022 17:51:05 +0100 Subject: [PATCH 043/261] Fix expected crashes for ordinal_type!=int in unit test --- src/sparse/KokkosSparse_spgemm_numeric.hpp | 4 +++- unit_test/sparse/Test_Sparse_spgemm.hpp | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/sparse/KokkosSparse_spgemm_numeric.hpp b/src/sparse/KokkosSparse_spgemm_numeric.hpp index 60a54f5b8b..5bc791397c 100644 --- a/src/sparse/KokkosSparse_spgemm_numeric.hpp +++ b/src/sparse/KokkosSparse_spgemm_numeric.hpp @@ -139,7 +139,9 @@ void spgemm_numeric(KernelHandle *handle, "If you need this case please let kokkos-kernels developers know.\n"); } - if (m < 1 || n < 1 || k < 1) return; + if (m < 1 || n < 1 || k < 1 || entriesA.extent(0) < 1 || + entriesB.extent(0) < 1) + return; typedef typename KernelHandle::const_size_type c_size_t; typedef typename KernelHandle::const_nnz_lno_t c_lno_t; diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp index ab84b7b0a5..47b06b716a 100644 --- a/unit_test/sparse/Test_Sparse_spgemm.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm.hpp @@ -269,6 +269,8 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, crsMat_t B = KokkosKernels::Impl::kk_generate_sparse_matrix( k, n, nnz, row_size_variance, bandwidth); + const bool is_empy_case = m < 1 || n < 1 || k < 1 || nnz < 1; + crsMat_t output_mat2; if (oldInterface) run_spgemm_old_interface(A, B, SPGEMM_DEBUG, output_mat2); @@ -305,8 +307,9 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, is_expected_to_fail = true; } #endif - // mkl requires local ordinals to be int. - if (!(std::is_same::value)) { + // MKL requires local ordinals to be int. + // Note: empty-array special case will NOT fail on this. + if (!std::is_same::value && !is_empy_case) { is_expected_to_fail = true; } // if size_type is larger than int, mkl casts it to int. @@ -345,7 +348,7 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, EXPECT_TRUE(is_expected_to_fail) << algo << ": " << e.what(); failed = true; } - EXPECT_TRUE((failed == is_expected_to_fail)); + EXPECT_EQ(is_expected_to_fail, failed); // double spgemm_time = timer1.seconds(); From 9e42209e41045194aeb5304197f16139e2db7fa4 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Mon, 7 Mar 2022 14:19:07 -0700 Subject: [PATCH 044/261] A couple newer sparse tests were not following the new testing pattern --- .../sparse/Test_Sparse_spmv_blockcrs.hpp | 241 ++---------------- unit_test/sparse/Test_Sparse_spmv_bsr.hpp | 240 ++--------------- 2 files changed, 38 insertions(+), 443 deletions(-) diff --git a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp index c30923a5bf..f775e4890d 100644 --- a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp @@ -500,229 +500,26 @@ void testBlockCrsMatrix_SpM_MV() { DEVICE>(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, int, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, int, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, int, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, int, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, size_t, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, size_t, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, size_t, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, size_t, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, int, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, int, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, int, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, int, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, size_t, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, size_t, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, size_t, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, size_t, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, int, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, int, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, - LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, int, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, int, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, LayoutLeft, - TestExecSpace) -#endif + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST + +#endif // KOKKOSKERNELS_INST_LAYOUTLEFT + #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, LayoutRight, - TestExecSpace) -#endif -#endif + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST + +#endif // KOKKOSKERNELS_INST_LAYOUTRIGHT #undef EXECUTE_BCRS_TIMES_MVEC_TEST diff --git a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp index 25b44b4e7e..73f5d103bd 100644 --- a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp @@ -609,228 +609,26 @@ void testBsrMatrix_SpM_MV() { testBsrMatrix_SpM_MV(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int, int, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int, int, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, int, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, int, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int, size_t, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int, size_t, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, size_t, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, size_t, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int, int, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int, int, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, int, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, int, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int, size_t, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int, size_t, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, size_t, LayoutLeft, TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, size_t, LayoutRight, TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, int, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, int, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, int, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, int, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, LayoutLeft, - TestExecSpace) -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, LayoutRight, - TestExecSpace) -#endif -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, LayoutLeft, - TestExecSpace) -#endif + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST + +#endif // KOKKOSKERNELS_INST_LAYOUTLEFT + #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, LayoutRight, - TestExecSpace) -#endif -#endif + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST + +#endif // KOKKOSKERNELS_INST_LAYOUTRIGHT #undef EXECUTE_BSR_TIMES_MVEC_TEST From e4f146a85965b461d3b1a8efaa7539efd51d2bf7 Mon Sep 17 00:00:00 2001 From: James Foucar Date: Tue, 8 Mar 2022 10:09:01 -0700 Subject: [PATCH 045/261] clang formatting --- unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp | 8 +++++--- unit_test/sparse/Test_Sparse_spmv_bsr.hpp | 6 ++++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp index f775e4890d..a96af6973e 100644 --- a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp @@ -503,7 +503,8 @@ void testBlockCrsMatrix_SpM_MV() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) + EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft, \ + TestExecSpace) #include @@ -513,8 +514,9 @@ void testBlockCrsMatrix_SpM_MV() { #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, \ + TestExecSpace) #include diff --git a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp index 73f5d103bd..344a203567 100644 --- a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp @@ -612,7 +612,8 @@ void testBsrMatrix_SpM_MV() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) + EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft, \ + TestExecSpace) #include @@ -623,7 +624,8 @@ void testBsrMatrix_SpM_MV() { #if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) + EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, \ + TestExecSpace) #include From 80a0cb5f1b59e42f1a0d0bba3cb9bb8b0c9f9b55 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 8 Mar 2022 13:40:03 -0700 Subject: [PATCH 046/261] cm_ scripts: Pthread -> Threads --- cm_generate_makefile.bash | 6 ++--- scripts/cm_test_all_sandia | 46 +++++++++++++++++++------------------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index b26ba7be97..043dcc2196 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -230,7 +230,7 @@ display_help_text() { echo "--with-openmptarget: Enable OpenMPTarget backend." echo "--with-sycl: Enable Sycl backend." echo "--with-openmp: Enable OpenMP backend." - echo "--with-pthread: Enable Pthreads backend." + echo "--with-threads: Enable Threads backend." echo "--with-serial: Enable Serial backend." echo "--with-devices: Explicitly add a set of backends." echo "" @@ -415,8 +415,8 @@ do --with-sycl) update_kokkos_devices Sycl ;; - --with-pthread) - update_kokkos_devices Pthread + --with-threads) + update_kokkos_devices Threads ;; --with-serial) update_kokkos_devices Serial diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index c049e6b721..1f8ee5ed51 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -50,8 +50,8 @@ print_help() { echo "--build-list=BUILD,BUILD,BUILD..." echo " Provide a comma-separated list of builds instead of running all builds" echo " Valid items:" - echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial" - echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial" + echo " OpenMP, Threads, Serial, OpenMP_Serial, Threads_Serial" + echo " Cuda_OpenMP, Cuda_Threads, Cuda_Serial" echo "" echo "--with-scalars=SCALARS: set KOKKOSKERNELS_SCALARS" echo " Provide a comma-separated list scalar types" @@ -183,12 +183,12 @@ fi echo "Running on machine: $MACHINE" -GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" +GCC_BUILD_LIST="OpenMP,Threads,Serial,OpenMP_Serial,Threads_Serial" IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" -CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" +INTEL_BUILD_LIST="OpenMP,Threads,Serial,OpenMP_Serial,Threads_Serial" +CLANG_BUILD_LIST="Threads,Serial,Threads_Serial" +CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Threads,Cuda_Serial" CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" @@ -526,7 +526,7 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" "gcc/7.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/5.0.1 $CLANG_BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "clang/5.0.1 $CLANG_BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" "clang/7.0.1 $CLANG7_MODULE_LIST "Cuda_OpenMP" clang++ $CLANG_WARNING_FLAGS" "cuda/9.2 $CUDA9_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) @@ -535,7 +535,7 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" "gcc/7.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/5.0.1 $CLANG_BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "clang/5.0.1 $CLANG_BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" "clang/7.0.1 $CLANG7_MODULE_LIST "Cuda_OpenMP" clang++ $CLANG_WARNING_FLAGS" "cuda/9.2 $CUDA9_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) @@ -620,7 +620,7 @@ elif [ "$MACHINE" = "white" ]; then CUDA10_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.4.0,openblas/0.3.4/gcc/7.4.0" IBM_MODULE_TPL_LIST="cmake/3.19.3,/xl/,gcc/7.2.0,netlib/3.8.0/ibm/xl/16.1.1" - # Don't do pthread on white. + # Don't do Threads on white. GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" # Don't run the IBM toolchain with CXX14 on white @@ -678,7 +678,7 @@ elif [ "$MACHINE" = "weaver" ]; then # "cuda/10.1.243 $CUDA10_MODULE_TPL_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - # Don't do pthread on weaver + # Don't do Threads on weaver GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" if [ "$SPOT_CHECK" = "True" ]; then @@ -789,14 +789,14 @@ elif [ "$MACHINE" = "blake" ]; then #"intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" COMPILERS=("intel/19.1.144 $BASE_MODULE_LIST_INTEL "OpenMP_Serial" icpc $INTEL_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST "Pthread_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" - "clang/10.0.1 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" + "clang/10.0.1 $BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) # TODO: Failing toolchains: #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL "OpenMP,Pthread" icpc $INTEL_WARNING_FLAGS" + COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" "gcc/7.2.0 $GCC72_MODULE_TPL_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" ) else @@ -845,36 +845,36 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then CLANG8_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,/,cuda/10.0" - BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Pthread" + BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Threads" BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_OpenMP" - BUILD_LIST_CLANG="Serial,Pthread,OpenMP" + BUILD_LIST_CLANG="Serial,Threads,OpenMP" CLANG8_CUDA_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized,-Wno-pass-failed" if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" + COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Threads" g++ $GCC_WARNING_FLAGS" "gcc/8.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "gcc/9.1 $GCC91_MODULE_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "Serial" icpc $INTEL_WARNING_FLAGS" "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Pthread" icpc $INTEL_WARNING_FLAGS" - "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Pthread_Serial" clang++ $CLANG8_CUDA_WARNING_FLAGS" - "clang/9.0.0 $CLANG_BASE_MODULE_LIST "Serial,Pthread" clang++ $CLANG_WARNING_FLAGS" + "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Threads" icpc $INTEL_WARNING_FLAGS" + "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Threads_Serial" clang++ $CLANG8_CUDA_WARNING_FLAGS" + "clang/9.0.0 $CLANG_BASE_MODULE_LIST "Serial,Threads" clang++ $CLANG_WARNING_FLAGS" "cuda/10.1 $NVCC_SEMSMODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/9.2 $NVCC_SEMSMODULE_LIST "Cuda_Serial" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/11.0 $NVCC11_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" + COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Threads" g++ $GCC_WARNING_FLAGS" "gcc/8.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "gcc/9.1 $GCC91_MODULE_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "Serial" icpc $INTEL_WARNING_FLAGS" "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Pthread" icpc $INTEL_WARNING_FLAGS" - "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Pthread_Serial" clang++ $CLANG8_CUDA_WARNING_FLAGS" - "clang/9.0.0 $CLANG_BASE_MODULE_LIST "Serial,Pthread" clang++ $CLANG_WARNING_FLAGS" + "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Threads" icpc $INTEL_WARNING_FLAGS" + "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Threads_Serial" clang++ $CLANG8_CUDA_WARNING_FLAGS" + "clang/9.0.0 $CLANG_BASE_MODULE_LIST "Serial,Threads" clang++ $CLANG_WARNING_FLAGS" "cuda/10.1 $NVCC_SEMSMODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/11.0 $NVCC11_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) From 373d309768dcb90aad259d291a4215e5d085ac50 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Mar 2022 17:02:27 -0700 Subject: [PATCH 047/261] perf_test/batched: Temporarily disable tests --- perf_test/batched/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/perf_test/batched/CMakeLists.txt b/perf_test/batched/CMakeLists.txt index 36435ecfc1..d044cf021f 100644 --- a/perf_test/batched/CMakeLists.txt +++ b/perf_test/batched/CMakeLists.txt @@ -1,9 +1,9 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockTridiag - SOURCES KokkosBatched_Test_BlockTridiagDirect.cpp -) -KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockJacobi - SOURCES KokkosBatched_Test_BlockJacobi_Tutorial.cpp -) +#KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockTridiag +# SOURCES KokkosBatched_Test_BlockTridiagDirect.cpp +#) +#KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockJacobi +# SOURCES KokkosBatched_Test_BlockJacobi_Tutorial.cpp +#) From cc13270949f6504cae13b8f64fe9eee66e7424a9 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 11 Mar 2022 10:47:52 -0700 Subject: [PATCH 048/261] GEMV: accumulate in float for scalar = bhalf_t (same change that was done in #1082 for scalar = half_t) This improves answer accuracy and also performance on GPU, since there isn't an atomic_add for these types but there is for float. --- src/blas/impl/KokkosBlas2_gemv_impl.hpp | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp index 6f27363be9..a16a9eaf9a 100644 --- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -64,8 +64,9 @@ struct SingleLevelNontransposeGEMV { using BetaCoeffType = typename YViewType::non_const_value_type; using y_value_type = typename YViewType::non_const_value_type; using AccumScalar = typename std::conditional< - std::is_same::value, float, - y_value_type>::type; + std::is_same::value || + std::is_same::value, + float, y_value_type>::type; SingleLevelNontransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, const BetaCoeffType& beta, @@ -146,8 +147,9 @@ struct SingleLevelTransposeGEMV { using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; using AccumScalar = typename std::conditional< - std::is_same::value, float, - y_value_type>::type; + std::is_same::value || + std::is_same::value, + float, y_value_type>::type; typedef AccumScalar value_type[]; IndexType value_count; // Kokkos needs this for reductions w/ array results @@ -479,8 +481,9 @@ struct TwoLevelGEMV { using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; using AccumScalar = typename std::conditional< - std::is_same::value, float, - y_value_type>::type; + std::is_same::value || + std::is_same::value, + float, y_value_type>::type; using execution_space = typename AViewType::execution_space; using policy_type = Kokkos::TeamPolicy; @@ -600,8 +603,9 @@ struct TwoLevelTransposeGEMV { using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; using AccumScalar = typename std::conditional< - std::is_same::value, float, - y_value_type>::type; + std::is_same::value || + std::is_same::value, + float, y_value_type>::type; using execution_space = typename AViewType::execution_space; using policy_type = Kokkos::TeamPolicy; @@ -739,7 +743,8 @@ void twoLevelGemv(const typename AViewType::execution_space& space, tagged_policy team; if (isLayoutLeft) { using AccumScalar = typename std::conditional< - std::is_same::value, + std::is_same::value || + std::is_same::value, float, y_value_type>::type; size_t sharedPerTeam = 32 * sizeof(AccumScalar); IndexType numTeams = (A.extent(0) + 31) / 32; From ec6cf576feeae05ff40933f81940a588aa1e2845 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 14 Mar 2022 13:23:43 -0600 Subject: [PATCH 049/261] Only instantiate Kokkos's default Cuda mem space Instead of instantiating for both Cuda,CudaSpace and Cuda,CudaUVMSpace by default, just instantiate for the Kokkos's default mem space (Cuda::memory_space), which is controlled by Kokkos_ENABLE_CUDA_UVM. --- cmake/kokkoskernels_eti_devices.cmake | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index 47dce1f9d1..9395cec564 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -41,19 +41,29 @@ SET(MEMSPACE_HBWSPACE_CPP_TYPE Kokkos::HBWSpace) IF(KOKKOS_ENABLE_CUDA) KOKKOSKERNELS_ADD_OPTION( INST_EXECSPACE_CUDA - ${KOKKOSKERNELS_INST_EXECSPACE_CUDA_DEFAULT} + ON BOOL "Whether to pre instantiate kernels for the execution space Kokkos::Cuda. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise." ) + + # By default, instantiate only for Cuda's default memory space (either CudaSpace, or CudaUVMSpace). + IF(KOKKOS_ENABLE_CUDA_UVM) + SET(CUDA_CUDAUVMSPACE_DEFAULT ON) + SET(CUDA_CUDASPACE_DEFAULT OFF) + ELSE() + SET(CUDA_CUDAUVMSPACE_DEFAULT OFF) + SET(CUDA_CUDASPACE_DEFAULT ON) + ENDIF() + KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_CUDAUVMSPACE - ${KOKKOSKERNELS_INST_EXECSPACE_CUDA_DEFAULT} + ${CUDA_CUDAUVMSPACE_DEFAULT} BOOL "Whether to pre instantiate kernels for the memory space Kokkos::CudaUVMSpace. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise." ) KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_CUDASPACE - ${KOKKOSKERNELS_INST_EXECSPACE_CUDA_DEFAULT} + ${CUDA_CUDASPACE_DEFAULT} BOOL "Whether to pre instantiate kernels for the memory space Kokkos::CudaSpace. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise." ) From a94ac9fdfe541a46aac1b18805116c0a3ced290d Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 15 Mar 2022 13:34:24 -0600 Subject: [PATCH 050/261] Disable offset=int by default (Make size_t the only default). int can still be enabled with KokkosKernels_INST_OFFSET_INT=ON. --- cmake/kokkoskernels_eti_offsets.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/kokkoskernels_eti_offsets.cmake b/cmake/kokkoskernels_eti_offsets.cmake index 171223010c..484175a976 100644 --- a/cmake/kokkoskernels_eti_offsets.cmake +++ b/cmake/kokkoskernels_eti_offsets.cmake @@ -1,5 +1,5 @@ SET(KOKKOSKERNELS_INST_OFFSET_SIZE_T_DEFAULT ${KOKKOSKERNELS_ADD_DEFAULT_ETI}) -SET(KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT ${KOKKOSKERNELS_ADD_DEFAULT_ETI}) +SET(KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT OFF) SET(OFFSETS OFFSET_INT OFFSET_SIZE_T @@ -12,14 +12,14 @@ KOKKOSKERNELS_ADD_OPTION( INST_OFFSET_INT ${KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT} BOOL - "Whether to pre instantiate kernels for the offset type int. This option is KokkosKernels_INST_OFFSET_INT=ON by default. Default: ON" + "Whether to pre instantiate kernels for the offset type int. This option is KokkosKernels_INST_OFFSET_INT=OFF by default. Default: OFF" ) KOKKOSKERNELS_ADD_OPTION( INST_OFFSET_SIZE_T ${KOKKOSKERNELS_INST_OFFSET_SIZE_T_DEFAULT} BOOL - "Whether to pre instantiate kernels for the offset type size_t. This option is KokkosKernels_INST_OFFSET_SIZE_T=OFF by default. Default: ON" + "Whether to pre instantiate kernels for the offset type size_t. This option is KokkosKernels_INST_OFFSET_SIZE_T=ON by default. Default: ON" ) IF (KOKKOSKERNELS_INST_OFFSET_INT) From 89c58decdcc4de1cc7509977dea235338582a116 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 17 Mar 2022 14:51:03 -0600 Subject: [PATCH 051/261] .github/workflows: Always enable int and size_t offsets --- .github/workflows/osx.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index ffdc484346..e4e5a33719 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -73,6 +73,8 @@ jobs: -DKokkosKernels_INST_FLOAT=ON \ -DKokkosKernels_INST_LAYOUTLEFT:BOOL=ON \ -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ .. From 4e2ed7d6fce42155ce1322a42d1edd89dce50e6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Mon, 21 Mar 2022 22:26:11 +0100 Subject: [PATCH 052/261] Rename mkl_apply() to mkl_numeric() --- src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp | 12 ++++++------ src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 36784731d0..d0c2172d4a 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -296,12 +296,12 @@ template -void mkl_apply(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, - a_rowmap_type row_mapA, a_index_type entriesA, - a_values_type valuesA, bool transposeA, b_rowmap_type row_mapB, - b_index_type entriesB, b_values_type valuesB, bool transposeB, - c_rowmap_type row_mapC, c_index_type entriesC, - c_values_type valuesC, bool verbose = false) { +void mkl_numeric(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, + a_rowmap_type row_mapA, a_index_type entriesA, + a_values_type valuesA, bool transposeA, b_rowmap_type row_mapB, + b_index_type entriesB, b_values_type valuesB, bool transposeB, + c_rowmap_type row_mapC, c_index_type entriesC, + c_values_type valuesC, bool verbose = false) { using mkl = MKL_SPMM; diff --git a/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp index 68e5e82bdb..0b28d2f02b 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp @@ -246,9 +246,9 @@ struct SPGEMM_NUMERIC< break; case SPGEMM_MKL: #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - mkl_apply(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, - row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC, - valuesC, handle->get_verbose()); + mkl_numeric(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, + row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC, + valuesC, handle->get_verbose()); #else throw std::runtime_error("MKL was not enabled in this build!"); #endif From 6344604df5a70d37b04b5ac381fb430cf42d2d85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Mon, 21 Mar 2022 23:36:58 +0100 Subject: [PATCH 053/261] Rename MKL_SAFE_CALL() to KOKKOSKERNELS_MKL_SAFE_CALL() --- perf_test/sparse/KokkosSparse_spadd.cpp | 14 +++--- src/common/KokkosKernels_SparseUtils_mkl.hpp | 34 +++++++------ ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 50 ++++++++++--------- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 26 +++++----- .../impl/KokkosSparse_spgemm_mkl_impl.hpp | 2 +- 5 files changed, 66 insertions(+), 60 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 49034930e6..de8b5fcca8 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -245,11 +245,11 @@ void run_experiment(const Params& params) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL sparse_matrix_t Amkl, Bmkl, Cmkl; if (params.use_mkl) { - MKL_SAFE_CALL(mkl_sparse_d_create_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( &Amkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)A.graph.row_map.data(), (int*)A.graph.row_map.data() + 1, A.graph.entries.data(), A.values.data())); - MKL_SAFE_CALL(mkl_sparse_d_create_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( &Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)B.graph.row_map.data(), (int*)B.graph.row_map.data() + 1, B.graph.entries.data(), B.values.data())); @@ -312,9 +312,9 @@ void run_experiment(const Params& params) { #endif } else if (params.use_mkl) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - MKL_SAFE_CALL(mkl_sparse_d_add(SPARSE_OPERATION_NON_TRANSPOSE, Amkl, - 1.0, Bmkl, &Cmkl)); - MKL_SAFE_CALL(mkl_sparse_destroy(Cmkl)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_add( + SPARSE_OPERATION_NON_TRANSPOSE, Amkl, 1.0, Bmkl, &Cmkl)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(Cmkl)); #endif } else { spadd_numeric( @@ -337,8 +337,8 @@ void run_experiment(const Params& params) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL if (params.use_mkl) { - MKL_SAFE_CALL(mkl_sparse_destroy(Amkl)); - MKL_SAFE_CALL(mkl_sparse_destroy(Bmkl)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(Amkl)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(Bmkl)); } #endif diff --git a/src/common/KokkosKernels_SparseUtils_mkl.hpp b/src/common/KokkosKernels_SparseUtils_mkl.hpp index 3bd1deb96a..80f9426134 100644 --- a/src/common/KokkosKernels_SparseUtils_mkl.hpp +++ b/src/common/KokkosKernels_SparseUtils_mkl.hpp @@ -65,7 +65,7 @@ inline void mkl_internal_safe_call(sparse_status_t mkl_status, const char *name, } } -#define MKL_SAFE_CALL(call) \ +#define KOKKOSKERNELS_MKL_SAFE_CALL(call) \ KokkosSparse::Impl::mkl_internal_safe_call(call, #call, __FILE__, __LINE__) inline sparse_operation_t mode_kk_to_mkl(char mode_kk) { @@ -117,7 +117,9 @@ class MKLSparseMatrix { MKL_INT *&rows_start, MKL_INT *&columns, value_type *&values); - inline void destroy() { MKL_SAFE_CALL(mkl_sparse_destroy(mtx)); } + inline void destroy() { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(mtx)); + } }; template <> @@ -125,8 +127,8 @@ inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, float *values) { - MKL_SAFE_CALL(mkl_sparse_s_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, - cols, xadj, xadj + 1, adj, values)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_csr( + &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj, values)); } template <> @@ -134,15 +136,15 @@ inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, double *values) { - MKL_SAFE_CALL(mkl_sparse_d_create_csr(&mtx, SPARSE_INDEX_BASE_ZERO, rows, - cols, xadj, xadj + 1, adj, values)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj, values)); } template <> inline MKLSparseMatrix>::MKLSparseMatrix( const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, Kokkos::complex *values) { - MKL_SAFE_CALL(mkl_sparse_c_create_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_csr( &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj, reinterpret_cast(values))); } @@ -151,7 +153,7 @@ template <> inline MKLSparseMatrix>::MKLSparseMatrix( const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, Kokkos::complex *values) { - MKL_SAFE_CALL(mkl_sparse_z_create_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_csr( &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj, reinterpret_cast(values))); } @@ -164,9 +166,9 @@ inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, float *&values) { sparse_index_base_t indexing; MKL_INT *rows_end; - MKL_SAFE_CALL(mkl_sparse_s_export_csr(mtx, &indexing, &num_rows, &num_cols, - &rows_start, &rows_end, &columns, - &values)); + KOKKOSKERNELS_MKL_SAFE_CALL( + mkl_sparse_s_export_csr(mtx, &indexing, &num_rows, &num_cols, &rows_start, + &rows_end, &columns, &values)); if (SPARSE_INDEX_BASE_ZERO != indexing) { throw std::runtime_error( "Expected zero based indexing in exported MKL sparse matrix\n"); @@ -182,9 +184,9 @@ inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, double *&values) { sparse_index_base_t indexing; MKL_INT *rows_end; - MKL_SAFE_CALL(mkl_sparse_d_export_csr(mtx, &indexing, &num_rows, &num_cols, - &rows_start, &rows_end, &columns, - &values)); + KOKKOSKERNELS_MKL_SAFE_CALL( + mkl_sparse_d_export_csr(mtx, &indexing, &num_rows, &num_cols, &rows_start, + &rows_end, &columns, &values)); if (SPARSE_INDEX_BASE_ZERO != indexing) { throw std::runtime_error( "Expected zero based indexing in exported MKL sparse matrix\n"); @@ -198,7 +200,7 @@ inline void MKLSparseMatrix>::export_data( MKL_INT *&columns, Kokkos::complex *&values) { sparse_index_base_t indexing; MKL_INT *rows_end; - MKL_SAFE_CALL(mkl_sparse_c_export_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_export_csr( mtx, &indexing, &num_rows, &num_cols, &rows_start, &rows_end, &columns, reinterpret_cast(&values))); if (SPARSE_INDEX_BASE_ZERO != indexing) { @@ -214,7 +216,7 @@ inline void MKLSparseMatrix>::export_data( MKL_INT *&columns, Kokkos::complex *&values) { sparse_index_base_t indexing; MKL_INT *rows_end; - MKL_SAFE_CALL(mkl_sparse_z_export_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_export_csr( mtx, &indexing, &num_rows, &num_cols, &rows_start, &rows_end, &columns, reinterpret_cast(&values))); if (SPARSE_INDEX_BASE_ZERO != indexing) { diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index d3c15e0267..6ef47f8008 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -73,13 +73,14 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, const int* Aentries, const float* Avalues, const float* x, float* y) { sparse_matrix_t A_mkl; - MKL_SAFE_CALL(mkl_sparse_s_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); - MKL_SAFE_CALL(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + KOKKOSKERNELS_MKL_SAFE_CALL( + mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, @@ -88,13 +89,14 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, const double* Avalues, const double* x, double* y) { sparse_matrix_t A_mkl; - MKL_SAFE_CALL(mkl_sparse_d_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); - MKL_SAFE_CALL(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + KOKKOSKERNELS_MKL_SAFE_CALL( + mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_block_impl_mkl(sparse_operation_t op, @@ -105,7 +107,7 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, const Kokkos::complex* x, Kokkos::complex* y) { sparse_matrix_t A_mkl; - MKL_SAFE_CALL(mkl_sparse_c_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); @@ -113,9 +115,9 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); MKL_Complex8& beta_mkl = reinterpret_cast(beta); matrix_descr A_descr = getDescription(); - MKL_SAFE_CALL(mkl_sparse_c_mv(op, alpha_mkl, A_mkl, A_descr, - reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv( + op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), + beta_mkl, reinterpret_cast(y))); } inline void spmv_block_impl_mkl(sparse_operation_t op, @@ -126,7 +128,7 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, const Kokkos::complex* x, Kokkos::complex* y) { sparse_matrix_t A_mkl; - MKL_SAFE_CALL(mkl_sparse_z_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); @@ -134,9 +136,9 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, matrix_descr A_descr = getDescription(); MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); MKL_Complex16& beta_mkl = reinterpret_cast(beta); - MKL_SAFE_CALL(mkl_sparse_z_mv(op, alpha_mkl, A_mkl, A_descr, - reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv( + op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), + beta_mkl, reinterpret_cast(y))); } inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, @@ -145,15 +147,15 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, const float* Avalues, const float* x, int colx, int ldx, float* y, int ldy) { sparse_matrix_t A_mkl; - MKL_SAFE_CALL(mkl_sparse_s_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); - MKL_SAFE_CALL(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr, - SPARSE_LAYOUT_ROW_MAJOR, x, colx, ldx, beta, y, - ldy)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr, + SPARSE_LAYOUT_ROW_MAJOR, x, colx, + ldx, beta, y, ldy)); } inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, @@ -162,15 +164,15 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, const double* Avalues, const double* x, int colx, int ldx, double* y, int ldy) { sparse_matrix_t A_mkl; - MKL_SAFE_CALL(mkl_sparse_d_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); - MKL_SAFE_CALL(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr, - SPARSE_LAYOUT_ROW_MAJOR, x, colx, ldx, beta, y, - ldy)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr, + SPARSE_LAYOUT_ROW_MAJOR, x, colx, + ldx, beta, y, ldy)); } inline void spm_mv_block_impl_mkl(sparse_operation_t op, @@ -182,7 +184,7 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, const Kokkos::complex* x, int colx, int ldx, Kokkos::complex* y, int ldy) { sparse_matrix_t A_mkl; - MKL_SAFE_CALL(mkl_sparse_c_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); @@ -190,7 +192,7 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); MKL_Complex8& beta_mkl = reinterpret_cast(beta); matrix_descr A_descr = getDescription(); - MKL_SAFE_CALL( + KOKKOSKERNELS_MKL_SAFE_CALL( mkl_sparse_c_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, reinterpret_cast(x), colx, ldx, beta_mkl, reinterpret_cast(y), ldy)); @@ -203,7 +205,7 @@ inline void spm_mv_block_impl_mkl( const Kokkos::complex* x, int colx, int ldx, Kokkos::complex* y, int ldy) { sparse_matrix_t A_mkl; - MKL_SAFE_CALL(mkl_sparse_z_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); @@ -211,7 +213,7 @@ inline void spm_mv_block_impl_mkl( matrix_descr A_descr = getDescription(); MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); MKL_Complex16& beta_mkl = reinterpret_cast(beta); - MKL_SAFE_CALL( + KOKKOSKERNELS_MKL_SAFE_CALL( mkl_sparse_z_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, reinterpret_cast(x), colx, ldx, beta_mkl, reinterpret_cast(y), ldy)); diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index bacc749840..ebd6ce8993 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -546,11 +546,12 @@ inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, int m, A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; - MKL_SAFE_CALL(mkl_sparse_s_create_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_csr( &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); - MKL_SAFE_CALL(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + KOKKOSKERNELS_MKL_SAFE_CALL( + mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, int m, @@ -561,11 +562,12 @@ inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, int m, A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; - MKL_SAFE_CALL(mkl_sparse_d_create_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); - MKL_SAFE_CALL(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + KOKKOSKERNELS_MKL_SAFE_CALL( + mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, @@ -579,15 +581,15 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; - MKL_SAFE_CALL(mkl_sparse_c_create_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_csr( &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); MKL_Complex8& beta_mkl = reinterpret_cast(beta); - MKL_SAFE_CALL(mkl_sparse_c_mv(op, alpha_mkl, A_mkl, A_descr, - reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv( + op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), + beta_mkl, reinterpret_cast(y))); } inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, @@ -601,15 +603,15 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; - MKL_SAFE_CALL(mkl_sparse_z_create_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_csr( &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); MKL_Complex16& beta_mkl = reinterpret_cast(beta); - MKL_SAFE_CALL(mkl_sparse_z_mv(op, alpha_mkl, A_mkl, A_descr, - reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv( + op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), + beta_mkl, reinterpret_cast(y))); } #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index d0c2172d4a..7ac10b4226 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -60,7 +60,7 @@ inline static MKLSparseMatrix mkl_spmm( sparse_operation_t operation, const MKLSparseMatrix &A, const MKLSparseMatrix &B) { sparse_matrix_t C; - MKL_SAFE_CALL(mkl_sparse_spmm(operation, A, B, &C)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_spmm(operation, A, B, &C)); return MKLSparseMatrix(C); } From 5b6cce474d2b233df2a05755c32694aa4fa45ca0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 22 Mar 2022 00:01:47 +0100 Subject: [PATCH 054/261] Use INT_MAX --- src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 7ac10b4226..64187e2a0b 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -154,9 +154,6 @@ class MKL_SPMM { std::cout << "MKL numeric time:" << timer.seconds() << std::endl; } - private: - static constexpr int max_integer = 2147483647; - private: template static void spmm(KernelHandle * /* handle */, nnz_lno_t m, nnz_lno_t n, @@ -186,8 +183,7 @@ class MKL_SPMM { int_tmp_view_t a_xadj_v, b_xadj_v; if (!std::is_same::value) { - if (entriesA.extent(0) > max_integer || - entriesB.extent(0) > max_integer) { + if (entriesA.extent(0) > INT_MAX || entriesB.extent(0) > INT_MAX) { throw std::runtime_error( "MKL requires integer values for size type for SPGEMM. Copying " "to " From de3891cb26b7e21a42a014772edf88b42b644fa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 22 Mar 2022 00:11:44 +0100 Subject: [PATCH 055/261] Rename MKL_SPMM to MKL_SPGEMM --- src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 64187e2a0b..d1bfb3db5c 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -69,7 +69,7 @@ template -class MKL_SPMM { +class MKL_SPGEMM { public: typedef typename KernelHandle::nnz_lno_t nnz_lno_t; typedef typename KernelHandle::size_type size_type; @@ -280,9 +280,9 @@ void mkl_symbolic(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, c_rowmap_type row_mapC, bool verbose = false) { using values_type = typename KernelHandle::scalar_temp_work_view_t; using c_index_type = b_index_type; - using mkl = MKL_SPMM; + using mkl = MKL_SPGEMM; mkl::mkl_symbolic(handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB, entriesB, transposeB, row_mapC, verbose); } @@ -298,9 +298,10 @@ void mkl_numeric(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, b_index_type entriesB, b_values_type valuesB, bool transposeB, c_rowmap_type row_mapC, c_index_type entriesC, c_values_type valuesC, bool verbose = false) { - using mkl = MKL_SPMM; + using mkl = + MKL_SPGEMM; mkl::mkl_numeric(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC, valuesC, verbose); From ea6da8d81de7a61fec7ada44ae411321f162560e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 22 Mar 2022 00:30:26 +0100 Subject: [PATCH 056/261] Explain MKL error --- src/common/KokkosKernels_SparseUtils_mkl.hpp | 29 ++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/src/common/KokkosKernels_SparseUtils_mkl.hpp b/src/common/KokkosKernels_SparseUtils_mkl.hpp index 80f9426134..b9eb3a9bd2 100644 --- a/src/common/KokkosKernels_SparseUtils_mkl.hpp +++ b/src/common/KokkosKernels_SparseUtils_mkl.hpp @@ -59,8 +59,33 @@ inline void mkl_internal_safe_call(sparse_status_t mkl_status, const char *name, const int line = 0) { if (SPARSE_STATUS_SUCCESS != mkl_status) { std::ostringstream oss; - oss << "MKL call \"" << name << "\" encountered error at " << file << ":" - << line << '\n'; + oss << "MKL call \"" << name << "\" at " << file << ":" << line + << " encountered error: "; + switch (mkl_status) { + case SPARSE_STATUS_NOT_INITIALIZED: + oss << "SPARSE_STATUS_NOT_INITIALIZED (empty handle or matrix arrays)"; + break; + case SPARSE_STATUS_ALLOC_FAILED: + oss << "SPARSE_STATUS_ALLOC_FAILED (internal error: memory allocation " + "failed)"; + break; + case SPARSE_STATUS_INVALID_VALUE: + oss << "SPARSE_STATUS_INVALID_VALUE (invalid input value)"; + break; + case SPARSE_STATUS_EXECUTION_FAILED: + oss << "SPARSE_STATUS_EXECUTION_FAILED (e.g. 0-diagonal element for " + "triangular solver)"; + break; + case SPARSE_STATUS_INTERNAL_ERROR: + oss << "SPARSE_STATUS_INTERNAL_ERROR"; + break; + case SPARSE_STATUS_NOT_SUPPORTED: + oss << "SPARSE_STATUS_NOT_SUPPORTED (e.g. operation for double " + "precision doesn't support other types)"; + break; + default: oss << "unknown (code " << (int)mkl_status << ")"; break; + } + oss << '\n'; Kokkos::abort(oss.str().c_str()); } } From c9d8d37a7e36d7ae010043f5aecebc8dc3acd166 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 17 Feb 2022 14:38:59 -0700 Subject: [PATCH 057/261] unit_test/sparse: Added common conversion test code - Added RandCscMat to KokkosKernels_TestUtils - Added Test_Sparse_TestUtils_RandCscMat --- .../impl/KokkosGraph_Distance1Color_impl.hpp | 1 + src/sparse/KokkosSparse_csc2csr.hpp | 48 ++++++++ test_common/KokkosKernels_TestUtils.hpp | 108 ++++++++++++++++++ unit_test/sparse/Test_Sparse.hpp | 2 + .../Test_Sparse_TestUtils_RandCscMat.hpp | 106 +++++++++++++++++ unit_test/sparse/Test_Sparse_csc2csr.hpp | 53 +++++++++ 6 files changed, 318 insertions(+) create mode 100644 src/sparse/KokkosSparse_csc2csr.hpp create mode 100644 unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp create mode 100644 unit_test/sparse/Test_Sparse_csc2csr.hpp diff --git a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 39e27795cc..1e2433def8 100644 --- a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -417,6 +417,7 @@ class GraphColor_VB double total_time_serial_conflict_resolution = 0.0; Kokkos::Timer timer; timer.reset(); + (void)total; int iter = 0; for (; (iter < this->_max_num_iterations) && (numUncolored > 0); iter++) { diff --git a/src/sparse/KokkosSparse_csc2csr.hpp b/src/sparse/KokkosSparse_csc2csr.hpp new file mode 100644 index 0000000000..bd4ade4b5b --- /dev/null +++ b/src/sparse/KokkosSparse_csc2csr.hpp @@ -0,0 +1,48 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSSPARSE_CSC2CSR_HPP +#define _KOKKOSSPARSE_CSC2CSR_HPP +// TODO +#endif // _KOKKOSSPARSE_CSC2CSR_HPP \ No newline at end of file diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 2878543f33..327847b7c1 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -45,6 +45,8 @@ #ifndef KOKKOSKERNELS_TEST_UTILS_HPP #define KOKKOSKERNELS_TEST_UTILS_HPP +#include + #include "KokkosKernels_Utils.hpp" #include "Kokkos_ArithTraits.hpp" #include "KokkosSparse_spmv.hpp" @@ -127,6 +129,8 @@ static inline const std::string kk_failure_str(std::string file, std::string func, const int line) { std::string failure_msg = " > from "; + // std::string test = + // ::testing::UnitTest::GetInstance()->current_test_info()->name(); failure_msg += (file + ":" + func + ":" + std::to_string(line) + "\n > "); return std::string(failure_msg); } @@ -488,5 +492,109 @@ int string_compare_no_case(const char* str1, const char* str2) { return strcmp(str1_s.c_str(), str2_s.c_str()); } +/// /brief Csc matrix class for testing purposes. +/// \tparam ScalarType +/// \tparam LayoutType +/// \tparam ExeSpaceType +template +class RandCscMat { + private: + using ValViewType = Kokkos::View; + using RowIdViewType = Kokkos::View; + using ColMapViewType = Kokkos::View; + size_t __nrows; + size_t __ncols; + size_t __nnz = 0; + ColMapViewType __col_map; + RowIdViewType __row_ids; + ValViewType __vals; + + /// Generates a random column map where: + /// 1. __col_map(i) is in [__row_ids.data(), &row_ids.data()[nnz - 1] + /// 2. __col_map(i) > col_map(i - 1) for i > 1 + /// 3. __col_map(i) == col_map(j) iff __col_map(i) == col_map(j) == nullptr + /// 4. __col_map(i) - col_map(i - 1) is in [0, m] + void __populate_random_csc_mat(uint64_t ticks) { + std::srand(ticks); + for (size_t col_idx = 0; col_idx < __ncols; col_idx++) { + size_t r = std::rand() % (__nrows + 1); + if (r == 0) { // 100% sparse column + __col_map(col_idx) = __nnz; + } else { // sparse column with r elements + // Populate r row ids + std::vector v(r); + + for (size_t i = 0; i < r; i++) v.at(i) = i; + + std::shuffle(v.begin(), v.end(), std::mt19937(std::random_device()())); + + for (size_t i = 0; i < r; i++) __row_ids(i + __nnz) = v.at(i); + + // Point to new column and accumulate number of non zeros + __col_map(col_idx) = __nnz; + __nnz += r; + } + } + + // last entry in map points to end of row id list + __col_map(__ncols) = __nnz; + } + + template + T __getter_copy_helper(T src) { + T dst(std::string("RandCscMat.") + typeid(T).name() + " copy", + src.extent(0)); + Kokkos::deep_copy(dst, src); + ExeSpaceType().fence(); + return dst; + } + + public: + std::string info; + /// Constructs a random csc matrix. + /// \param m The number of rows. + /// \param n The number of columns. + /// \param min_val The minimum scalar value in the matrix. + /// \param max_val The maximum scalar value in the matrix. + RandCscMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { + __ncols = n; + __nrows = m; + __col_map = ColMapViewType("RandCscMat.ColMapViewType", __ncols + 1); + __row_ids = + RowIdViewType("RandCscMat.RowIdViewType", m * n + 1); // over-allocated + + uint64_t ticks = + std::chrono::high_resolution_clock::now().time_since_epoch().count() % + UINT32_MAX; + + info = std::string(std::string("RandCscMat<") + typeid(ScalarType).name() + + ", " + typeid(LayoutType).name() + ", " + + typeid(ExeSpaceType).name() + ">(" + std::to_string(m) + + ", " + std::to_string(n) + + "...): rand seed: " + std::to_string(ticks) + "\n"); + Kokkos::Random_XorShift64_Pool random(ticks); + __populate_random_csc_mat(ticks); + + __vals = ValViewType("RandCscMat.ValViewType", __nnz + 1); + Kokkos::fill_random(__vals, random, min_val, max_val); // random scalars + ExeSpaceType().fence(); + __vals(__nnz) = ScalarType(0); + } + + // O(c), where c is a constant. + ScalarType operator()(size_t idx) { return __vals(idx); } + + size_t get_nnz() { return __nnz; } + size_t get_m() { return __nrows; } + size_t get_n() { return __ncols; } + size_t get_col_len(size_t j) { + return j < __ncols ? (__col_map(j + 1) - __col_map(j)) : 0; + } + size_t get_col_start(size_t j) { return j < __ncols ? __col_map(j) : 0; } + ValViewType get_vals() { return __getter_copy_helper(__vals); } + RowIdViewType get_row_ids() { return __getter_copy_helper(__row_ids); } + ColMapViewType get_col_map() { return __getter_copy_helper(__col_map); } +}; + } // namespace Test #endif diff --git a/unit_test/sparse/Test_Sparse.hpp b/unit_test/sparse/Test_Sparse.hpp index 30639512c5..684b6855f2 100644 --- a/unit_test/sparse/Test_Sparse.hpp +++ b/unit_test/sparse/Test_Sparse.hpp @@ -18,6 +18,8 @@ #include "Test_Sparse_spmv_bsr.hpp" #include "Test_Sparse_sptrsv.hpp" #include "Test_Sparse_trsv.hpp" +#include "Test_Sparse_TestUtils_RandCscMat.hpp" +#include "Test_Sparse_csc2csr.hpp" // TPL specific tests, these require // particular pairs of backend and TPL diff --git a/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp b/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp new file mode 100644 index 0000000000..e56cd4bb40 --- /dev/null +++ b/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp @@ -0,0 +1,106 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include "KokkosKernels_TestUtils.hpp" + +namespace Test { +template +void doCscMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { + auto expected_min = ScalarType(1.0); + size_t expected_nnz = 0; + RandCscMat cm(m, n, min_val, max_val); + + std::string fmsg = kk_failure_str(__FILE__, __FUNCTION__, __LINE__); + for (size_t i = 0; i < cm.get_nnz(); ++i) + ASSERT_GE(cm(i), expected_min) << cm.info; + + for (size_t j = 0; j < cm.get_n(); ++j) { + for (size_t i = 0; i < cm.get_col_len(j); ++i) + ASSERT_FLOAT_EQ(cm(cm.get_col_start(j) + i), cm(expected_nnz + i)) + << cm.info; + expected_nnz += cm.get_col_len(j); + } + ASSERT_EQ(cm.get_nnz(), expected_nnz) << cm.info; + + // No need to check data here. Kokkos unit-tests deep_copy. + auto vals = cm.get_vals(); + ASSERT_EQ(vals.extent(0), cm.get_nnz() + 1) << cm.info; + + auto row_ids = cm.get_row_ids(); + ASSERT_EQ(row_ids.extent(0), cm.get_n() * cm.get_m() + 1) << cm.info; + + auto col_map = cm.get_col_map(); + ASSERT_EQ(col_map.extent(0), cm.get_n() + 1); +} + +template +void doAllCscMat(size_t m, size_t n) { + int min = 1, max = 10; + + // Verify that CscMax is constructed properly. + doCscMat(m, n, min, max); + doCscMat(m, n, min, max); + + doCscMat(m, n, min, max); + doCscMat(m, n, min, max); + + // Verify that CscMax can be instantiated with complex types. + RandCscMat, Kokkos::LayoutLeft, ExeSpaceType> cmcf( + m, n, min, max); + RandCscMat, Kokkos::LayoutRight, ExeSpaceType> cmcd( + m, n, min, max); +} + +// Test randomly generated csc matrices +TEST_F(TestCategory, sparse_randcscmat) { + // Square cases + for (int dim = 1; dim < 1024; dim *= 4) doAllCscMat(dim, dim); + + // Non-square cases + for (int dim = 1; dim < 1024; dim *= 4) { + doAllCscMat(dim * 3, dim); + doAllCscMat(dim, dim * 3); + } +} +} // namespace Test \ No newline at end of file diff --git a/unit_test/sparse/Test_Sparse_csc2csr.hpp b/unit_test/sparse/Test_Sparse_csc2csr.hpp new file mode 100644 index 0000000000..f6f5033dbe --- /dev/null +++ b/unit_test/sparse/Test_Sparse_csc2csr.hpp @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include "KokkosSparse_csc2csr.hpp" +#include "KokkosKernels_TestUtils.hpp" + +namespace Test { +// template +// void doCsc2Csr() { +// TODO +// } +} // namespace Test \ No newline at end of file From cab5d252c935f66813021f0ea15ed291e1936444 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 2 Mar 2022 13:42:41 -0700 Subject: [PATCH 058/261] src/sparse: - Add boilerplate code for construction a crs matrix from views in csc2csr unit_test/sparse: - Update RandCscMat to use int64_t rather than size_t due to signed ordinal type requirement in CrsMatrix. - Added initial csc2csr test. --- src/sparse/KokkosSparse_csc2csr.hpp | 30 +++++++++++- test_common/KokkosKernels_TestUtils.hpp | 34 +++++++------- .../Test_Sparse_TestUtils_RandCscMat.hpp | 10 ++-- unit_test/sparse/Test_Sparse_csc2csr.hpp | 47 +++++++++++++++++-- 4 files changed, 94 insertions(+), 27 deletions(-) diff --git a/src/sparse/KokkosSparse_csc2csr.hpp b/src/sparse/KokkosSparse_csc2csr.hpp index bd4ade4b5b..558f362568 100644 --- a/src/sparse/KokkosSparse_csc2csr.hpp +++ b/src/sparse/KokkosSparse_csc2csr.hpp @@ -44,5 +44,33 @@ #ifndef _KOKKOSSPARSE_CSC2CSR_HPP #define _KOKKOSSPARSE_CSC2CSR_HPP -// TODO +namespace KokkosSparse { +template +auto csc2csr(OrdinalType nrows, OrdinalType ncols, SizeType nnz, + ValViewType vals, RowIdViewType row_ids, ColMapViewType col_map) { + using CrsST = typename ValViewType::value_type; + using CrsOT = OrdinalType; + using CrsDT = typename ValViewType::execution_space; + using CrsMT = void; + using CrsSzT = SizeType; + using CrsType = CrsMatrix; + using CrsValsViewType = typename CrsType::values_type; + using CrsRowMapViewType = typename CrsType::row_map_type::non_const_type; + using CrsColIdViewType = typename CrsType::index_type; + + CrsValsViewType crs_vals( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "csc2csr vals"), nnz); + CrsRowMapViewType crs_row_map( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "csc2csr row_map"), + nrows + 1); + CrsColIdViewType crs_col_ids( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "csc2csr col_ids"), nnz); + + // TODO: populate crs views + + return CrsType("csc2csr", nrows, ncols, nnz, crs_vals, crs_row_map, + crs_col_ids); +} +} // namespace KokkosSparse #endif // _KOKKOSSPARSE_CSC2CSR_HPP \ No newline at end of file diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 327847b7c1..f5009154a6 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -500,11 +500,11 @@ template class RandCscMat { private: using ValViewType = Kokkos::View; - using RowIdViewType = Kokkos::View; - using ColMapViewType = Kokkos::View; - size_t __nrows; - size_t __ncols; - size_t __nnz = 0; + using RowIdViewType = Kokkos::View; + using ColMapViewType = Kokkos::View; + int64_t __nrows; + int64_t __ncols; + int64_t __nnz = 0; ColMapViewType __col_map; RowIdViewType __row_ids; ValViewType __vals; @@ -516,19 +516,19 @@ class RandCscMat { /// 4. __col_map(i) - col_map(i - 1) is in [0, m] void __populate_random_csc_mat(uint64_t ticks) { std::srand(ticks); - for (size_t col_idx = 0; col_idx < __ncols; col_idx++) { - size_t r = std::rand() % (__nrows + 1); + for (int64_t col_idx = 0; col_idx < __ncols; col_idx++) { + int64_t r = std::rand() % (__nrows + 1); if (r == 0) { // 100% sparse column __col_map(col_idx) = __nnz; } else { // sparse column with r elements // Populate r row ids - std::vector v(r); + std::vector v(r); - for (size_t i = 0; i < r; i++) v.at(i) = i; + for (int64_t i = 0; i < r; i++) v.at(i) = i; std::shuffle(v.begin(), v.end(), std::mt19937(std::random_device()())); - for (size_t i = 0; i < r; i++) __row_ids(i + __nnz) = v.at(i); + for (int64_t i = 0; i < r; i++) __row_ids(i + __nnz) = v.at(i); // Point to new column and accumulate number of non zeros __col_map(col_idx) = __nnz; @@ -556,7 +556,7 @@ class RandCscMat { /// \param n The number of columns. /// \param min_val The minimum scalar value in the matrix. /// \param max_val The maximum scalar value in the matrix. - RandCscMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { + RandCscMat(int64_t m, int64_t n, ScalarType min_val, ScalarType max_val) { __ncols = n; __nrows = m; __col_map = ColMapViewType("RandCscMat.ColMapViewType", __ncols + 1); @@ -582,15 +582,15 @@ class RandCscMat { } // O(c), where c is a constant. - ScalarType operator()(size_t idx) { return __vals(idx); } + ScalarType operator()(int64_t idx) { return __vals(idx); } - size_t get_nnz() { return __nnz; } - size_t get_m() { return __nrows; } - size_t get_n() { return __ncols; } - size_t get_col_len(size_t j) { + int64_t get_nnz() { return __nnz; } + int64_t get_m() { return __nrows; } + int64_t get_n() { return __ncols; } + int64_t get_col_len(int64_t j) { return j < __ncols ? (__col_map(j + 1) - __col_map(j)) : 0; } - size_t get_col_start(size_t j) { return j < __ncols ? __col_map(j) : 0; } + int64_t get_col_start(int64_t j) { return j < __ncols ? __col_map(j) : 0; } ValViewType get_vals() { return __getter_copy_helper(__vals); } RowIdViewType get_row_ids() { return __getter_copy_helper(__row_ids); } ColMapViewType get_col_map() { return __getter_copy_helper(__col_map); } diff --git a/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp b/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp index e56cd4bb40..1d2589be21 100644 --- a/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp +++ b/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp @@ -47,16 +47,16 @@ namespace Test { template void doCscMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { - auto expected_min = ScalarType(1.0); - size_t expected_nnz = 0; + auto expected_min = ScalarType(1.0); + int64_t expected_nnz = 0; RandCscMat cm(m, n, min_val, max_val); std::string fmsg = kk_failure_str(__FILE__, __FUNCTION__, __LINE__); - for (size_t i = 0; i < cm.get_nnz(); ++i) + for (int64_t i = 0; i < cm.get_nnz(); ++i) ASSERT_GE(cm(i), expected_min) << cm.info; - for (size_t j = 0; j < cm.get_n(); ++j) { - for (size_t i = 0; i < cm.get_col_len(j); ++i) + for (int64_t j = 0; j < cm.get_n(); ++j) { + for (int64_t i = 0; i < cm.get_col_len(j); ++i) ASSERT_FLOAT_EQ(cm(cm.get_col_start(j) + i), cm(expected_nnz + i)) << cm.info; expected_nnz += cm.get_col_len(j); diff --git a/unit_test/sparse/Test_Sparse_csc2csr.hpp b/unit_test/sparse/Test_Sparse_csc2csr.hpp index f6f5033dbe..0633be7312 100644 --- a/unit_test/sparse/Test_Sparse_csc2csr.hpp +++ b/unit_test/sparse/Test_Sparse_csc2csr.hpp @@ -46,8 +46,47 @@ #include "KokkosKernels_TestUtils.hpp" namespace Test { -// template -// void doCsc2Csr() { -// TODO -// } +template +void doCsc2Csr(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { + RandCscMat cscMat(m, n, min_val, + max_val); + + auto csrMat = KokkosSparse::csc2csr( + cscMat.get_m(), cscMat.get_n(), cscMat.get_nnz(), cscMat.get_vals(), + cscMat.get_row_ids(), cscMat.get_col_map()); + + // TODO check csrMat against cscMat +} + +template +void doAllScalarsCsc2Csr(size_t m, size_t n, int min, int max) { + doCsc2Csr(m, n, min, max); + doCsc2Csr(m, n, min, max); + doCsc2Csr, LayoutType, ExeSpaceType>(m, n, min, max); + doCsc2Csr, LayoutType, ExeSpaceType>(m, n, min, max); +} + +template +void doAllLayoutsCsc2Csr(size_t m, size_t n, int min, int max) { + doAllScalarsCsc2Csr(m, n, min, max); + doAllScalarsCsc2Csr(m, n, min, max); +} + +template +void doAllCsc2csr(size_t m, size_t n) { + int min = 1, max = 10; + doAllLayoutsCsc2Csr(m, n, min, max); +} + +TEST_F(TestCategory, sparse_csc2csr) { + // Square cases + for (size_t dim = 1; dim < 1024; dim *= 4) + doAllCsc2csr(dim, dim); + + // Non-square cases + for (size_t dim = 1; dim < 1024; dim *= 4) { + doAllCsc2csr(dim * 3, dim); + doAllCsc2csr(dim, dim * 3); + } +} } // namespace Test \ No newline at end of file From 58d74bf62112f1b7d216686d3fc21607a70958e4 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 2 Mar 2022 16:36:30 -0700 Subject: [PATCH 059/261] src/sparse: Add initial csc2csr impl --- src/sparse/KokkosSparse_csc2csr.hpp | 194 +++++++++++++++++++-- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 4 +- unit_test/sparse/Test_Sparse_csc2csr.hpp | 41 ++++- 3 files changed, 222 insertions(+), 17 deletions(-) diff --git a/src/sparse/KokkosSparse_csc2csr.hpp b/src/sparse/KokkosSparse_csc2csr.hpp index 558f362568..17aaf6b85b 100644 --- a/src/sparse/KokkosSparse_csc2csr.hpp +++ b/src/sparse/KokkosSparse_csc2csr.hpp @@ -42,13 +42,18 @@ //@HEADER */ +#include "KokkosKernels_Utils.hpp" +#include +#include + #ifndef _KOKKOSSPARSE_CSC2CSR_HPP #define _KOKKOSSPARSE_CSC2CSR_HPP namespace KokkosSparse { +namespace Impl { template -auto csc2csr(OrdinalType nrows, OrdinalType ncols, SizeType nnz, - ValViewType vals, RowIdViewType row_ids, ColMapViewType col_map) { +class Csc2Csr { + private: using CrsST = typename ValViewType::value_type; using CrsOT = OrdinalType; using CrsDT = typename ValViewType::execution_space; @@ -59,18 +64,183 @@ auto csc2csr(OrdinalType nrows, OrdinalType ncols, SizeType nnz, using CrsRowMapViewType = typename CrsType::row_map_type::non_const_type; using CrsColIdViewType = typename CrsType::index_type; - CrsValsViewType crs_vals( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "csc2csr vals"), nnz); - CrsRowMapViewType crs_row_map( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "csc2csr row_map"), - nrows + 1); - CrsColIdViewType crs_col_ids( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "csc2csr col_ids"), nnz); + OrdinalType __nrows; + OrdinalType __ncols; + SizeType __nnz; + ValViewType __vals; + RowIdViewType __row_ids; + ColMapViewType __col_map; + + RowIdViewType __crs_row_cnt; + + CrsValsViewType __crs_vals; + CrsRowMapViewType __crs_row_map; + CrsRowMapViewType __crs_row_map_scratch; + CrsColIdViewType __crs_col_ids; + + struct AlgoTags { + struct s1RowCnt {}; + struct s2RowMap {}; + struct s3Copy {}; + }; + + using s1RowCntTag = typename AlgoTags::s1RowCnt; + using s3CopyTag = typename AlgoTags::s3Copy; + + using TeamPolicyType = Kokkos::TeamPolicy; + + int __suggested_team_size, __suggested_vec_size, __league_size; + + template + void __run(FunctorType &functor) { + // s1RowCntTag + { + Kokkos::parallel_for("Csc2Csr", + Kokkos::RangePolicy(0, __nnz), + functor); + CrsDT().fence(); + } + // s2RowMapTag + { + namespace KE = Kokkos::Experimental; + CrsDT crsDT; + KE::exclusive_scan(crsDT, KE::cbegin(__crs_row_cnt), + KE::cend(__crs_row_cnt), KE::begin(__crs_row_map), 0); + __crs_row_map(__nrows) = __nnz; + CrsDT().fence(); + Kokkos::deep_copy(__crs_row_map_scratch, __crs_row_map); + CrsDT().fence(); + } + // s3CopyTag + { + TeamPolicyType teamPolicy(__ncols, __suggested_team_size, + __suggested_vec_size); + Kokkos::parallel_for("Csc2Csr", teamPolicy, functor); + CrsDT().fence(); + } + // TODO: s3CopySortCompressTag + } + + public: + template + class __Functor { + private: + OrdinalType __nrows; + OrdinalType __ncols; + SizeType __nnz; + ValViewType &__vals; + CrsValsViewType &__crs_vals; + RowIdViewType &__row_ids; + CrsRowMapViewType &__crs_row_map; + CrsRowMapViewType &__crs_row_map_scratch; + ColMapViewType &__col_map; + CrsColIdViewType &__crs_col_ids; + RowIdViewType &__crs_row_cnt; + + public: + __Functor(OrdinalType nrows, OrdinalType ncols, SizeType nnz, + ValViewType &vals, CrsValsViewType &crs_vals, + RowIdViewType &row_ids, CrsRowMapViewType &crs_row_map, + CrsRowMapViewType &crs_row_map_scratch, ColMapViewType &col_map, + CrsColIdViewType &crs_col_ids, RowIdViewType &crs_row_cnt) + : __nrows(nrows), + __ncols(ncols), + __nnz(nnz), + __vals(vals), + __crs_vals(crs_vals), + __row_ids(row_ids), + __crs_row_map(crs_row_map), + __crs_row_map_scratch(crs_row_map_scratch), + __col_map(col_map), + __crs_col_ids(crs_col_ids), + __crs_row_cnt(crs_row_cnt){}; - // TODO: populate crs views + KOKKOS_INLINE_FUNCTION + void operator()(const s3CopyTag &, const MemberType &member) const { + auto j = member.league_rank(); + auto col_start = __col_map(j); + auto col_len = __col_map(j + 1) - col_start; - return CrsType("csc2csr", nrows, ncols, nnz, crs_vals, crs_row_map, - crs_col_ids); + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, col_len), [&](const int &k) { + auto idx = col_start + k; + auto i = __row_ids(idx); + auto crs_idx = + Kokkos::atomic_fetch_inc(&__crs_row_map_scratch.data()[i]); + __crs_col_ids(crs_idx) = j; + __crs_vals(crs_idx) = __vals(idx); + }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const s1RowCntTag &, const int &thread_id) const { + Kokkos::atomic_inc(&__crs_row_cnt.data()[__row_ids(thread_id)]); + } + }; + + Csc2Csr(OrdinalType nrows, OrdinalType ncols, SizeType nnz, ValViewType vals, + RowIdViewType row_ids, ColMapViewType col_map, int league_size = 2) + : __nrows(nrows), + __ncols(ncols), + __nnz(nnz), + __vals(vals), + __row_ids(row_ids), + __col_map(col_map), + __league_size(league_size) { + __crs_vals = CrsValsViewType( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "__crs_vals"), nnz); + __crs_row_map = CrsRowMapViewType( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "__crs_row_map"), + nrows + 1); + __crs_row_map_scratch = + CrsRowMapViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, + "__crs_row_map_scratch"), + nrows + 1); + __crs_col_ids = CrsColIdViewType( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "__crs_col_ids"), nnz); + + __crs_row_cnt = RowIdViewType("__crs_row_cnt", __nrows); + + __Functor functor( + __nrows, __ncols, __nnz, __vals, __crs_vals, __row_ids, __crs_row_map, + __crs_row_map_scratch, __col_map, __crs_col_ids, __crs_row_cnt); + + KokkosKernels::Impl::get_suggested_vector_size( + __suggested_vec_size, __nrows, __nnz); + __suggested_team_size = + KokkosKernels::Impl::get_suggested_team_size( + functor, __suggested_vec_size); + + __run(functor); + } + + CrsType get_csrMat() { + return CrsType("csc2csr", __nrows, __ncols, __nnz, __crs_vals, + __crs_row_map, __crs_col_ids); + } +}; +} // namespace Impl +/// +/// \brief Converts a csc matrix to a CrsMatrix. +/// \tparam OrdinalType The view value type associated with the RowIdViewType +/// \tparam SizeType The type of nnz +/// \tparam ValViewType The values view type +/// \tparam RowIdViewType The row ids view type +/// \tparam ColMapViewType The column map view type +/// \param nrows The number of rows in the csc matrix +/// \param ncols The number of columns in the csc matrix +/// \param nnz The number of non-zeros in the csc matrix +/// \param vals The values view of the csc matrix +/// \param row_ids The row ids view of the csc matrix +/// \param col_map The column map view of the csc matrix +/// \return A KokkosSparse::CrsMatrix. +template +auto csc2csr(OrdinalType nrows, OrdinalType ncols, SizeType nnz, + ValViewType vals, RowIdViewType row_ids, ColMapViewType col_map, + int league_size) { + Impl::Csc2Csr csc2Csr(nrows, ncols, nnz, vals, row_ids, col_map, league_size); + return csc2Csr.get_csrMat(); } } // namespace KokkosSparse #endif // _KOKKOSSPARSE_CSC2CSR_HPP \ No newline at end of file diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 41843d8674..fcd02a851e 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -94,7 +94,7 @@ struct SPMV_Transpose_Functor { AMatrix m_A; XVector m_x; YVector m_y; - ordinal_type rows_per_team; + ordinal_type rows_per_team = 0; SPMV_Transpose_Functor(const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, const YVector& m_y_) @@ -725,7 +725,7 @@ struct SPMV_MV_Transpose_Functor { YVector m_y; const ordinal_type n; - ordinal_type rows_per_team; + ordinal_type rows_per_team = 0; SPMV_MV_Transpose_Functor(const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, const coefficient_type& beta_, diff --git a/unit_test/sparse/Test_Sparse_csc2csr.hpp b/unit_test/sparse/Test_Sparse_csc2csr.hpp index 0633be7312..b0f433639e 100644 --- a/unit_test/sparse/Test_Sparse_csc2csr.hpp +++ b/unit_test/sparse/Test_Sparse_csc2csr.hpp @@ -50,12 +50,47 @@ template void doCsc2Csr(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { RandCscMat cscMat(m, n, min_val, max_val); + constexpr int league_size = 32; auto csrMat = KokkosSparse::csc2csr( cscMat.get_m(), cscMat.get_n(), cscMat.get_nnz(), cscMat.get_vals(), - cscMat.get_row_ids(), cscMat.get_col_map()); + cscMat.get_row_ids(), cscMat.get_col_map(), league_size); - // TODO check csrMat against cscMat + auto csc_row_ids = cscMat.get_row_ids(); + auto csc_col_map = cscMat.get_col_map(); + auto csc_vals = cscMat.get_vals(); + + auto csr_col_ids = csrMat.graph.entries; + auto csr_row_map = csrMat.graph.row_map; + auto csr_vals = csrMat.values; + + for (int j = 0; j < cscMat.get_n(); ++j) { + auto col_start = csc_col_map(j); + auto col_len = csc_col_map(j + 1) - col_start; + + for (int k = 0; k < col_len; ++k) { + auto i = col_start + k; + + auto row_start = csr_row_map(csc_row_ids(i)); + auto row_len = csr_row_map(csc_row_ids(i) + 1) - row_start; + auto row_end = row_start + row_len; + + if (row_len == 0) continue; + + // Linear search for corresponding element in csr matrix + int l = row_start; + while (l < row_end && csr_col_ids(l) != j) { + ++l; + } + + if (l == row_end) + FAIL() << "csr element at (i: " << csc_row_ids(i) << ", j: " << j + << ") not found!" << std::endl; + + ASSERT_EQ(csc_vals(i), csr_vals(l)) + << "(i: " << csc_row_ids(i) << ", j: " << j << ")" << std::endl; + } + } } template @@ -80,7 +115,7 @@ void doAllCsc2csr(size_t m, size_t n) { TEST_F(TestCategory, sparse_csc2csr) { // Square cases - for (size_t dim = 1; dim < 1024; dim *= 4) + for (size_t dim = 4; dim < 1024; dim *= 4) doAllCsc2csr(dim, dim); // Non-square cases From f00fd886ee4e9cdf474d7d676de10451401c36b0 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 17 Mar 2022 15:56:30 -0600 Subject: [PATCH 060/261] Implement feedback --- .../impl/KokkosGraph_Distance1Color_impl.hpp | 1 - src/sparse/KokkosSparse_csc2csr.hpp | 31 ++++++++++--------- test_common/KokkosKernels_TestUtils.hpp | 2 -- .../Test_Sparse_TestUtils_RandCscMat.hpp | 1 - 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 1e2433def8..39e27795cc 100644 --- a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -417,7 +417,6 @@ class GraphColor_VB double total_time_serial_conflict_resolution = 0.0; Kokkos::Timer timer; timer.reset(); - (void)total; int iter = 0; for (; (iter < this->_max_num_iterations) && (numUncolored > 0); iter++) { diff --git a/src/sparse/KokkosSparse_csc2csr.hpp b/src/sparse/KokkosSparse_csc2csr.hpp index 17aaf6b85b..f19368f15f 100644 --- a/src/sparse/KokkosSparse_csc2csr.hpp +++ b/src/sparse/KokkosSparse_csc2csr.hpp @@ -56,10 +56,10 @@ class Csc2Csr { private: using CrsST = typename ValViewType::value_type; using CrsOT = OrdinalType; - using CrsDT = typename ValViewType::execution_space; + using CrsET = typename ValViewType::execution_space; using CrsMT = void; using CrsSzT = SizeType; - using CrsType = CrsMatrix; + using CrsType = CrsMatrix; using CrsValsViewType = typename CrsType::values_type; using CrsRowMapViewType = typename CrsType::row_map_type::non_const_type; using CrsColIdViewType = typename CrsType::index_type; @@ -87,7 +87,7 @@ class Csc2Csr { using s1RowCntTag = typename AlgoTags::s1RowCnt; using s3CopyTag = typename AlgoTags::s3Copy; - using TeamPolicyType = Kokkos::TeamPolicy; + using TeamPolicyType = Kokkos::TeamPolicy; int __suggested_team_size, __suggested_vec_size, __league_size; @@ -96,27 +96,28 @@ class Csc2Csr { // s1RowCntTag { Kokkos::parallel_for("Csc2Csr", - Kokkos::RangePolicy(0, __nnz), + Kokkos::RangePolicy(0, __nnz), functor); - CrsDT().fence(); + CrsET().fence(); } // s2RowMapTag { namespace KE = Kokkos::Experimental; - CrsDT crsDT; - KE::exclusive_scan(crsDT, KE::cbegin(__crs_row_cnt), - KE::cend(__crs_row_cnt), KE::begin(__crs_row_map), 0); - __crs_row_map(__nrows) = __nnz; - CrsDT().fence(); + CrsET crsET; + KE::inclusive_scan(crsET, KE::cbegin(__crs_row_cnt), + KE::cend(__crs_row_cnt), KE::begin(__crs_row_map) + 1); + __crs_row_map(0) = 0; + assert(__crs_row_map(__nrows) == __nnz); + CrsET().fence(); Kokkos::deep_copy(__crs_row_map_scratch, __crs_row_map); - CrsDT().fence(); + CrsET().fence(); } // s3CopyTag { TeamPolicyType teamPolicy(__ncols, __suggested_team_size, __suggested_vec_size); Kokkos::parallel_for("Csc2Csr", teamPolicy, functor); - CrsDT().fence(); + CrsET().fence(); } // TODO: s3CopySortCompressTag } @@ -205,7 +206,7 @@ class Csc2Csr { __nrows, __ncols, __nnz, __vals, __crs_vals, __row_ids, __crs_row_map, __crs_row_map_scratch, __col_map, __crs_col_ids, __crs_row_cnt); - KokkosKernels::Impl::get_suggested_vector_size( + KokkosKernels::Impl::get_suggested_vector_size( __suggested_vec_size, __nrows, __nnz); __suggested_team_size = KokkosKernels::Impl::get_suggested_team_size( @@ -239,7 +240,9 @@ template ; + Csc2csrType csc2Csr(nrows, ncols, nnz, vals, row_ids, col_map, league_size); return csc2Csr.get_csrMat(); } } // namespace KokkosSparse diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index f5009154a6..8e32cf38f2 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -129,8 +129,6 @@ static inline const std::string kk_failure_str(std::string file, std::string func, const int line) { std::string failure_msg = " > from "; - // std::string test = - // ::testing::UnitTest::GetInstance()->current_test_info()->name(); failure_msg += (file + ":" + func + ":" + std::to_string(line) + "\n > "); return std::string(failure_msg); } diff --git a/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp b/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp index 1d2589be21..fc33f9f08b 100644 --- a/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp +++ b/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp @@ -51,7 +51,6 @@ void doCscMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { int64_t expected_nnz = 0; RandCscMat cm(m, n, min_val, max_val); - std::string fmsg = kk_failure_str(__FILE__, __FUNCTION__, __LINE__); for (int64_t i = 0; i < cm.get_nnz(); ++i) ASSERT_GE(cm(i), expected_min) << cm.info; From 6a222463e3bd65ecf19fe57110ce275e7ed203a2 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 17 Mar 2022 16:14:47 -0600 Subject: [PATCH 061/261] Add fully sparse test cases --- test_common/KokkosKernels_TestUtils.hpp | 24 ++++++++++++++---------- unit_test/sparse/Test_Sparse_csc2csr.hpp | 11 ++++++++--- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 8e32cf38f2..00810f77cd 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -506,6 +506,7 @@ class RandCscMat { ColMapViewType __col_map; RowIdViewType __row_ids; ValViewType __vals; + bool __fully_sparse; /// Generates a random column map where: /// 1. __col_map(i) is in [__row_ids.data(), &row_ids.data()[nnz - 1] @@ -516,7 +517,7 @@ class RandCscMat { std::srand(ticks); for (int64_t col_idx = 0; col_idx < __ncols; col_idx++) { int64_t r = std::rand() % (__nrows + 1); - if (r == 0) { // 100% sparse column + if (r == 0 || __fully_sparse) { // 100% sparse column __col_map(col_idx) = __nnz; } else { // sparse column with r elements // Populate r row ids @@ -554,10 +555,12 @@ class RandCscMat { /// \param n The number of columns. /// \param min_val The minimum scalar value in the matrix. /// \param max_val The maximum scalar value in the matrix. - RandCscMat(int64_t m, int64_t n, ScalarType min_val, ScalarType max_val) { - __ncols = n; - __nrows = m; - __col_map = ColMapViewType("RandCscMat.ColMapViewType", __ncols + 1); + RandCscMat(int64_t m, int64_t n, ScalarType min_val, ScalarType max_val, + bool fully_sparse = false) { + __ncols = n; + __nrows = m; + __fully_sparse = fully_sparse; + __col_map = ColMapViewType("RandCscMat.ColMapViewType", __ncols + 1); __row_ids = RowIdViewType("RandCscMat.RowIdViewType", m * n + 1); // over-allocated @@ -565,11 +568,12 @@ class RandCscMat { std::chrono::high_resolution_clock::now().time_since_epoch().count() % UINT32_MAX; - info = std::string(std::string("RandCscMat<") + typeid(ScalarType).name() + - ", " + typeid(LayoutType).name() + ", " + - typeid(ExeSpaceType).name() + ">(" + std::to_string(m) + - ", " + std::to_string(n) + - "...): rand seed: " + std::to_string(ticks) + "\n"); + info = std::string( + std::string("RandCscMat<") + typeid(ScalarType).name() + ", " + + typeid(LayoutType).name() + ", " + typeid(ExeSpaceType).name() + ">(" + + std::to_string(m) + ", " + std::to_string(n) + + "...): rand seed: " + std::to_string(ticks) + + ", fully sparse: " + (__fully_sparse ? "true" : "false") + "\n"); Kokkos::Random_XorShift64_Pool random(ticks); __populate_random_csc_mat(ticks); diff --git a/unit_test/sparse/Test_Sparse_csc2csr.hpp b/unit_test/sparse/Test_Sparse_csc2csr.hpp index b0f433639e..cdc70e4f0b 100644 --- a/unit_test/sparse/Test_Sparse_csc2csr.hpp +++ b/unit_test/sparse/Test_Sparse_csc2csr.hpp @@ -47,9 +47,10 @@ namespace Test { template -void doCsc2Csr(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { - RandCscMat cscMat(m, n, min_val, - max_val); +void doCsc2Csr(size_t m, size_t n, ScalarType min_val, ScalarType max_val, + bool fully_sparse = false) { + RandCscMat cscMat( + m, n, min_val, max_val, fully_sparse); constexpr int league_size = 32; auto csrMat = KokkosSparse::csc2csr( @@ -123,5 +124,9 @@ TEST_F(TestCategory, sparse_csc2csr) { doAllCsc2csr(dim * 3, dim); doAllCsc2csr(dim, dim * 3); } + + // Fully sparse + doCsc2Csr(5, 5, 1, 10, true); + doCsc2Csr(50, 10, 10, 100, true); } } // namespace Test \ No newline at end of file From e30ac9d198b7e17494df3b5b4d22488dedb1a384 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 22 Mar 2022 17:03:10 -0600 Subject: [PATCH 062/261] Start restructuring docs --- docs/conf.py | 6 +-- docs/developer/apidocs.rst | 14 +++++++ docs/developer/apidocs/batched_dense.rst | 9 +++++ docs/developer/apidocs/batched_sparse.rst | 5 +++ docs/developer/apidocs/blas1.rst | 17 +++++++++ docs/developer/apidocs/blas2.rst | 4 ++ docs/developer/apidocs/blas3.rst | 4 ++ docs/developer/apidocs/sparse.rst | 9 +++++ docs/developer/contrib.rst | 46 +++++++++++++++++++++++ docs/developer/index.rst | 10 +++++ docs/developer/style.rst | 34 +++++++++++++++++ docs/developer/write_developer_doc.rst | 0 docs/developer/write_user_doc.rst | 0 docs/index.rst | 33 ++-------------- 14 files changed, 159 insertions(+), 32 deletions(-) create mode 100644 docs/developer/apidocs.rst create mode 100644 docs/developer/apidocs/batched_dense.rst create mode 100644 docs/developer/apidocs/batched_sparse.rst create mode 100644 docs/developer/apidocs/blas1.rst create mode 100644 docs/developer/apidocs/blas2.rst create mode 100644 docs/developer/apidocs/blas3.rst create mode 100644 docs/developer/apidocs/sparse.rst create mode 100644 docs/developer/contrib.rst create mode 100644 docs/developer/index.rst create mode 100644 docs/developer/style.rst create mode 100644 docs/developer/write_developer_doc.rst create mode 100644 docs/developer/write_user_doc.rst diff --git a/docs/conf.py b/docs/conf.py index efb406329b..59377e4f11 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,11 +18,11 @@ # -- Project information ----------------------------------------------------- project = 'Kokkos Kernels' -copyright = '2021, Evan Harvey' -author = 'Evan Harvey' +copyright = '2022, Kokkos Development Team' +author = 'Kokkos Team' # The full version, including alpha/beta/rc tags -release = 'v3.4.1' +release = 'latest' # -- General configuration --------------------------------------------------- diff --git a/docs/developer/apidocs.rst b/docs/developer/apidocs.rst new file mode 100644 index 0000000000..82797c5801 --- /dev/null +++ b/docs/developer/apidocs.rst @@ -0,0 +1,14 @@ +Source Code Documentation +========================= + +The source documentation is extracted from the C++ files using Doxygen. + +.. toctree:: + :maxdepth: 4 + + apidocs/blas1 + apidocs/blas2 + apidocs/blas3 + apidocs/sparse + apidocs/batched_dense + apidocs/batched_sparse \ No newline at end of file diff --git a/docs/developer/apidocs/batched_dense.rst b/docs/developer/apidocs/batched_dense.rst new file mode 100644 index 0000000000..cc4040bb80 --- /dev/null +++ b/docs/developer/apidocs/batched_dense.rst @@ -0,0 +1,9 @@ +BATCHED -- KokkosKernels batched functor-level interfaces +========================================================= + +.. doxygenclass:: KokkosBatched::SerialAxpby + :members: +.. doxygenclass:: KokkosBatched::TeamAxpby + :members: +.. doxygenclass:: KokkosBatched::TeamVectorAxpby + :members: diff --git a/docs/developer/apidocs/batched_sparse.rst b/docs/developer/apidocs/batched_sparse.rst new file mode 100644 index 0000000000..38592c90fd --- /dev/null +++ b/docs/developer/apidocs/batched_sparse.rst @@ -0,0 +1,5 @@ +SPARSE BATCHED -- KokkosKernels sparse batched functor-level interfaces +======================================================================= + +.. doxygenclass:: KokkosBatched::CG + :members: \ No newline at end of file diff --git a/docs/developer/apidocs/blas1.rst b/docs/developer/apidocs/blas1.rst new file mode 100644 index 0000000000..3fddfc29c6 --- /dev/null +++ b/docs/developer/apidocs/blas1.rst @@ -0,0 +1,17 @@ +BLAS1 -- KokkosKernels blas1 interfaces +======================================= + +.. doxygenfunction:: KokkosBlas::axpby +.. doxygenfunction:: KokkosBlas::dot +.. doxygenfunction:: KokkosBlas::fill +.. doxygenfunction:: KokkosBlas::mult +.. doxygenfunction:: KokkosBlas::nrm1 +.. doxygenfunction:: KokkosBlas::nrm2 +.. doxygenfunction:: KokkosBlas::nrm2 +.. doxygenfunction:: KokkosBlas::nrm2w +.. doxygenfunction:: KokkosBlas::nrm2w +.. doxygenfunction:: KokkosBlas::nrminf +.. doxygenfunction:: KokkosBlas::reciprocal +.. doxygenfunction:: KokkosBlas::scal +.. doxygenfunction:: KokkosBlas::sum +.. doxygenfunction:: KokkosBlas::update diff --git a/docs/developer/apidocs/blas2.rst b/docs/developer/apidocs/blas2.rst new file mode 100644 index 0000000000..bd7f3c8d70 --- /dev/null +++ b/docs/developer/apidocs/blas2.rst @@ -0,0 +1,4 @@ +BLAS2 -- KokkosKernels blas2 interfaces +======================================= + +.. doxygenfunction:: KokkosBlas::gemv diff --git a/docs/developer/apidocs/blas3.rst b/docs/developer/apidocs/blas3.rst new file mode 100644 index 0000000000..8dda488a74 --- /dev/null +++ b/docs/developer/apidocs/blas3.rst @@ -0,0 +1,4 @@ +BLAS3 -- KokkosKernels blas3 interfaces +======================================= + +.. doxygenfunction:: KokkosBlas::gemm diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst new file mode 100644 index 0000000000..e1bd74babb --- /dev/null +++ b/docs/developer/apidocs/sparse.rst @@ -0,0 +1,9 @@ +SPARSE -- KokkosKernels sparse interfaces +========================================= + +.. doxygenclass:: KokkosSparse::CrsMatrix + :members: +.. doxygenfunction:: KokkosSparse::spmv +.. doxygenfunction:: KokkosSparse::trsv +.. doxygenfunction:: KokkosSparse::spgemm +.. doxygenfunction:: KokkosSparse::gauss diff --git a/docs/developer/contrib.rst b/docs/developer/contrib.rst new file mode 100644 index 0000000000..495e8cbd94 --- /dev/null +++ b/docs/developer/contrib.rst @@ -0,0 +1,46 @@ +Contributing +============ + +Comment Style +------------- +We follow doxygen style comments for both external (API) and internal members. See https://www.doxygen.nl/manual/docblocks.html for details. +Our documentation can be generated using the `-DKokkosKernels_ENABLE_DOCS:BOOL=ON` cmake flag. + +In general, we prefer that the prototype has the doxygen style comment rather than the definition. If there is no prototype, then the definition should have the doxygen style comment. + +.. code-block:: + :caption: API Doxygen Style Example + + /// \brief Blocking wrapper for accessing a Kokkos View. + /// \tparam ViewValueType The value type (Scalar or Vector) of each view element + /// \tparam ViewType The view type + /// \param v The view handle + /// \param m The requested row index of v + /// \param n The requested col index of v + /// \return If m and n are within the extents of v, a valid element of v; + /// otherwise, the last element of v. + /// + template + KOKKOS_INLINE_FUNCTION ViewValueType + access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &); + +Library policies +---------------- + +System-specific functions +------------------------- +For portability, any system-specific function that is not in the C++ standard should not be invoked from kokkos-kernels. + +Upcasting and downcasting +------------------------- +TODO + +Blocking and non-blocking interfaces +------------------------------------ +All the APIs are non-blocking unless: +1. A TPL is enabled +2. The result vector resides on the host and work is offloaded to a device + +When a TPL is enabled, we follow the blocking semantics of the TPL interface. + +If no TPLs are enabled, callers can avoid blocking calls by using any overload which accepts a result vector type as a template argument. \ No newline at end of file diff --git a/docs/developer/index.rst b/docs/developer/index.rst new file mode 100644 index 0000000000..d45eb38474 --- /dev/null +++ b/docs/developer/index.rst @@ -0,0 +1,10 @@ +Developer Manual +================ + +.. toctree:: + :maxdepth: 2 + + Source Code Documentation + Building the Documentation + Code Style Guide + Contributing \ No newline at end of file diff --git a/docs/developer/style.rst b/docs/developer/style.rst new file mode 100644 index 0000000000..ddd9ce5197 --- /dev/null +++ b/docs/developer/style.rst @@ -0,0 +1,34 @@ +Style Guide +=========== + +We follow google's c++ coding style. See https://google.github.io/styleguide/cppguide.html and https://github.com/kokkos/kokkos-kernels/blob/master/.clang-format for details. + +.. code-block:: + :caption: Automate coding style via a pre-commit hook + + cat kokkos-kernels/.git/hooks/pre-commit + for FILE in $(git diff --cached --name-only | egrep '.*\.cpp$|.*\.hpp$|.*\.h$') + do + if [ -e $file ]; then + clang-format-8 -i -style=file $FILE + git add $FILEA + fi + done + chmod +x kokkos-kernels/.git/hooks/pre-commit + +.. code-block:: + :caption: Conditionally enable or disable formatting + + // clang-format off + cpp code here + // clang-format on + +.. code-block:: + :caption: Instal clang-format on MacOS + + brew install clang-format-8 + +.. code-block:: + :caption: Instal clang-format on Ubuntu + + apt install clang-format-8 \ No newline at end of file diff --git a/docs/developer/write_developer_doc.rst b/docs/developer/write_developer_doc.rst new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/developer/write_user_doc.rst b/docs/developer/write_user_doc.rst new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/index.rst b/docs/index.rst index 06240595bf..f5dded3aad 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,33 +1,8 @@ -.. Kokkos Kernels documentation master file, created by - sphinx-quickstart on Fri Sep 24 13:19:45 2021. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to Kokkos Kernels's documentation! +Kokkos Kernels documentation ========================================== - .. toctree:: :maxdepth: 2 - :caption: Contents: - - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - -Docs -==== -.. doxygennamespace:: KokkosBlas - :project: KokkosKernels - :members: -.. doxygennamespace:: KokkosSparse - :project: KokkosKernels - :members: -.. doxygennamespace:: KokkosBatched - :project: KokkosKernels - :members: \ No newline at end of file + KokkosKernels GitHub Homepage + User Manual + Developer API Docs From 3b37dfc5d5b4cd2b1a362d68b9645cfc54e0ef48 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 22 Mar 2022 21:04:41 -0600 Subject: [PATCH 063/261] KokkosSparse_spiluk.cpp perf test: add int-int guards to cusparse codes Attempt to address #1366 --- perf_test/sparse/KokkosSparse_spiluk.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/perf_test/sparse/KokkosSparse_spiluk.cpp b/perf_test/sparse/KokkosSparse_spiluk.cpp index d381b9b888..70d160c83e 100644 --- a/perf_test/sparse/KokkosSparse_spiluk.cpp +++ b/perf_test/sparse/KokkosSparse_spiluk.cpp @@ -257,6 +257,10 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, << std::endl; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + // cuSPARSE requires lno_t = size_type = int. For both, int is always used + // (if enabled) +#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT) if (fill_lev == 0) { std::cout << "CUSPARSE: No KK interface added yet" << std::endl; @@ -412,6 +416,7 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, } // end row std::cout << "ILU(0) SUCCESS!" << std::endl; } // fill_lev=0 +#endif #endif // Benchmark @@ -436,6 +441,10 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "LOOP_MIN_TIME: " << min_time << std::endl; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + // cuSPARSE requires lno_t = size_type = int. For both, int is always used + // (if enabled) +#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT) if (fill_lev == 0) { lno_view_t A_row_map("A_row_map", nrows + 1); lno_nnz_view_t A_entries("A_entries", nnz); @@ -465,15 +474,21 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "LOOP_MAX_TIME (cuSPARSE): " << max_time << std::endl; std::cout << "LOOP_MIN_TIME (cuSPARSE): " << min_time << std::endl; } // fill_lev=0 +#endif #endif } // end tests #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + // cuSPARSE requires lno_t = size_type = int. For both, int is always used + // (if enabled) +#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT) // step 6: free resources cudaFree(pBuffer); cusparseDestroyCsrilu02Info(info); cusparseDestroyMatDescr(descr); cusparseDestroy(handle); +#endif #endif } // end if (!afilename.empty()) From b7021116944b7bd2958d9e80e2bf0b5759bdecd2 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 23 Mar 2022 11:10:47 -0600 Subject: [PATCH 064/261] clang-format fix --- perf_test/sparse/KokkosSparse_spiluk.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spiluk.cpp b/perf_test/sparse/KokkosSparse_spiluk.cpp index 70d160c83e..2ee9573880 100644 --- a/perf_test/sparse/KokkosSparse_spiluk.cpp +++ b/perf_test/sparse/KokkosSparse_spiluk.cpp @@ -257,8 +257,8 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, << std::endl; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. For both, int is always used - // (if enabled) + // cuSPARSE requires lno_t = size_type = int. For both, int is always used + // (if enabled) #if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT) if (fill_lev == 0) { @@ -441,8 +441,8 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "LOOP_MIN_TIME: " << min_time << std::endl; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - // cuSPARSE requires lno_t = size_type = int. For both, int is always used - // (if enabled) + // cuSPARSE requires lno_t = size_type = int. For both, int is always used + // (if enabled) #if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT) if (fill_lev == 0) { From 9289d2648248c4a38439a0960345736ea172dad8 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 23 Mar 2022 17:13:18 -0600 Subject: [PATCH 065/261] Fix check that view has const value type --- src/sparse/KokkosSparse_getDiagCopy.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sparse/KokkosSparse_getDiagCopy.hpp b/src/sparse/KokkosSparse_getDiagCopy.hpp index a96d0c3a10..c1d45b13ec 100644 --- a/src/sparse/KokkosSparse_getDiagCopy.hpp +++ b/src/sparse/KokkosSparse_getDiagCopy.hpp @@ -61,7 +61,8 @@ void getDiagCopy(const DiagType& D, const OffsetsType& offsets, static_assert(static_cast(DiagType::rank) == 1, "The DiagType template parameter must be a 1-D Kokkos::View."); static_assert( - std::is_same::value, + std::is_same::value, "The DiagType template parameter must be a nonconst Kokkos::View."); static_assert(Kokkos::is_view::value, "The OffsetsType template parameter must be a Kokkos::View."); From 83f2c9948ea12c6f3314d38c34d78e4086fbd361 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 22 Mar 2022 17:09:22 -0600 Subject: [PATCH 066/261] Fix cuda errors --- src/sparse/KokkosSparse_csc2csr.hpp | 2 + test_common/KokkosKernels_TestUtils.hpp | 48 ++++++++++++++++-------- unit_test/sparse/Test_Sparse_csc2csr.hpp | 46 +++++++++++++++++++---- 3 files changed, 74 insertions(+), 22 deletions(-) diff --git a/src/sparse/KokkosSparse_csc2csr.hpp b/src/sparse/KokkosSparse_csc2csr.hpp index f19368f15f..ce06a4a729 100644 --- a/src/sparse/KokkosSparse_csc2csr.hpp +++ b/src/sparse/KokkosSparse_csc2csr.hpp @@ -78,6 +78,7 @@ class Csc2Csr { CrsRowMapViewType __crs_row_map_scratch; CrsColIdViewType __crs_col_ids; + public: struct AlgoTags { struct s1RowCnt {}; struct s2RowMap {}; @@ -87,6 +88,7 @@ class Csc2Csr { using s1RowCntTag = typename AlgoTags::s1RowCnt; using s3CopyTag = typename AlgoTags::s3Copy; + private: using TeamPolicyType = Kokkos::TeamPolicy; int __suggested_team_size, __suggested_vec_size, __league_size; diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 00810f77cd..815afed38b 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -497,15 +497,21 @@ int string_compare_no_case(const char* str1, const char* str2) { template class RandCscMat { private: - using ValViewType = Kokkos::View; - using RowIdViewType = Kokkos::View; - using ColMapViewType = Kokkos::View; + using ValViewTypeD = Kokkos::View; + using RowIdViewTypeD = Kokkos::View; + using ColMapViewTypeD = Kokkos::View; + using ValViewTypeH = Kokkos::View; + using RowIdViewTypeH = Kokkos::View; + using ColMapViewTypeH = Kokkos::View; int64_t __nrows; int64_t __ncols; int64_t __nnz = 0; - ColMapViewType __col_map; - RowIdViewType __row_ids; - ValViewType __vals; + ColMapViewTypeD __col_map_d; + RowIdViewTypeD __row_ids_d; + ValViewTypeD __vals_d; + ColMapViewTypeH __col_map; + RowIdViewTypeH __row_ids; + ValViewTypeH __vals; bool __fully_sparse; /// Generates a random column map where: @@ -537,6 +543,11 @@ class RandCscMat { // last entry in map points to end of row id list __col_map(__ncols) = __nnz; + + // Copy to device + Kokkos::deep_copy(__col_map_d, __col_map); + Kokkos::deep_copy(__row_ids_d, __row_ids); + ExeSpaceType().fence(); } template @@ -560,9 +571,11 @@ class RandCscMat { __ncols = n; __nrows = m; __fully_sparse = fully_sparse; - __col_map = ColMapViewType("RandCscMat.ColMapViewType", __ncols + 1); - __row_ids = - RowIdViewType("RandCscMat.RowIdViewType", m * n + 1); // over-allocated + __col_map_d = ColMapViewTypeD("RandCscMat.ColMapViewType", __ncols + 1); + __col_map = Kokkos::create_mirror_view(__col_map_d); + __row_ids_d = RowIdViewTypeD("RandCscMat.RowIdViewType", + m * n + 1); // over-allocated + __row_ids = Kokkos::create_mirror_view(__row_ids_d); uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count() % @@ -574,13 +587,18 @@ class RandCscMat { std::to_string(m) + ", " + std::to_string(n) + "...): rand seed: " + std::to_string(ticks) + ", fully sparse: " + (__fully_sparse ? "true" : "false") + "\n"); - Kokkos::Random_XorShift64_Pool random(ticks); + Kokkos::Random_XorShift64_Pool random(ticks); __populate_random_csc_mat(ticks); - __vals = ValViewType("RandCscMat.ValViewType", __nnz + 1); + __vals_d = ValViewTypeD("RandCscMat.ValViewType", __nnz + 1); + __vals = Kokkos::create_mirror_view(__vals_d); Kokkos::fill_random(__vals, random, min_val, max_val); // random scalars - ExeSpaceType().fence(); + Kokkos::fence(); __vals(__nnz) = ScalarType(0); + + // Copy to device + Kokkos::deep_copy(__vals_d, __vals); + ExeSpaceType().fence(); } // O(c), where c is a constant. @@ -593,9 +611,9 @@ class RandCscMat { return j < __ncols ? (__col_map(j + 1) - __col_map(j)) : 0; } int64_t get_col_start(int64_t j) { return j < __ncols ? __col_map(j) : 0; } - ValViewType get_vals() { return __getter_copy_helper(__vals); } - RowIdViewType get_row_ids() { return __getter_copy_helper(__row_ids); } - ColMapViewType get_col_map() { return __getter_copy_helper(__col_map); } + ValViewTypeD get_vals() { return __getter_copy_helper(__vals_d); } + RowIdViewTypeD get_row_ids() { return __getter_copy_helper(__row_ids_d); } + ColMapViewTypeD get_col_map() { return __getter_copy_helper(__col_map_d); } }; } // namespace Test diff --git a/unit_test/sparse/Test_Sparse_csc2csr.hpp b/unit_test/sparse/Test_Sparse_csc2csr.hpp index cdc70e4f0b..e7d2ad868e 100644 --- a/unit_test/sparse/Test_Sparse_csc2csr.hpp +++ b/unit_test/sparse/Test_Sparse_csc2csr.hpp @@ -57,13 +57,45 @@ void doCsc2Csr(size_t m, size_t n, ScalarType min_val, ScalarType max_val, cscMat.get_m(), cscMat.get_n(), cscMat.get_nnz(), cscMat.get_vals(), cscMat.get_row_ids(), cscMat.get_col_map(), league_size); - auto csc_row_ids = cscMat.get_row_ids(); - auto csc_col_map = cscMat.get_col_map(); - auto csc_vals = cscMat.get_vals(); - - auto csr_col_ids = csrMat.graph.entries; - auto csr_row_map = csrMat.graph.row_map; - auto csr_vals = csrMat.values; + auto csc_row_ids_d = cscMat.get_row_ids(); + auto csc_col_map_d = cscMat.get_col_map(); + auto csc_vals_d = cscMat.get_vals(); + + using ViewTypeRowIds = decltype(csc_row_ids_d); + using ViewTypeColMap = decltype(csc_col_map_d); + using ViewTypeVals = decltype(csc_vals_d); + + // Copy to host + typename ViewTypeRowIds::HostMirror csc_row_ids = + Kokkos::create_mirror_view(csc_row_ids_d); + Kokkos::deep_copy(csc_row_ids, csc_row_ids_d); + typename ViewTypeColMap::HostMirror csc_col_map = + Kokkos::create_mirror_view(csc_col_map_d); + Kokkos::deep_copy(csc_col_map, csc_col_map_d); + typename ViewTypeVals::HostMirror csc_vals = + Kokkos::create_mirror_view(csc_vals_d); + Kokkos::deep_copy(csc_vals, csc_vals_d); + + auto csr_col_ids_d = csrMat.graph.entries; + auto csr_row_map_d = csrMat.graph.row_map; + auto csr_vals_d = csrMat.values; + + using ViewTypeCsrColIds = decltype(csr_col_ids_d); + using ViewTypeCsrRowMap = decltype(csr_row_map_d); + using ViewTypeCsrVals = decltype(csr_vals_d); + + // Copy to host + typename ViewTypeCsrColIds::HostMirror csr_col_ids = + Kokkos::create_mirror_view(csr_col_ids_d); + Kokkos::deep_copy(csr_col_ids, csr_col_ids_d); + typename ViewTypeCsrRowMap::HostMirror csr_row_map = + Kokkos::create_mirror_view(csr_row_map_d); + Kokkos::deep_copy(csr_row_map, csr_row_map_d); + typename ViewTypeCsrVals::HostMirror csr_vals = + Kokkos::create_mirror_view(csr_vals_d); + Kokkos::deep_copy(csr_vals, csr_vals_d); + + Kokkos::fence(); for (int j = 0; j < cscMat.get_n(); ++j) { auto col_start = csc_col_map(j); From 74cdf36031876d145e53e488f3aa0a7dbac64e9c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 23 Mar 2022 18:50:18 -0600 Subject: [PATCH 067/261] Switch to exclusive scan to avoid copying to/from the host --- src/sparse/KokkosSparse_csc2csr.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/sparse/KokkosSparse_csc2csr.hpp b/src/sparse/KokkosSparse_csc2csr.hpp index ce06a4a729..90cee9b51c 100644 --- a/src/sparse/KokkosSparse_csc2csr.hpp +++ b/src/sparse/KokkosSparse_csc2csr.hpp @@ -106,10 +106,11 @@ class Csc2Csr { { namespace KE = Kokkos::Experimental; CrsET crsET; - KE::inclusive_scan(crsET, KE::cbegin(__crs_row_cnt), - KE::cend(__crs_row_cnt), KE::begin(__crs_row_map) + 1); - __crs_row_map(0) = 0; - assert(__crs_row_map(__nrows) == __nnz); + // Use exclusive scan so we can allocate the row map uninitialized and + // avoid accessing device views on the host. + KE::exclusive_scan(crsET, KE::cbegin(__crs_row_cnt), + KE::cend(__crs_row_cnt) + 1, KE::begin(__crs_row_map), + 0); CrsET().fence(); Kokkos::deep_copy(__crs_row_map_scratch, __crs_row_map); CrsET().fence(); From f94354340a5eb2e621a68c210d39455b5ddc6608 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 24 Mar 2022 11:31:16 -0600 Subject: [PATCH 068/261] Fix cuda 9 build errors --- test_common/KokkosKernels_TestUtils.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 815afed38b..ec27c44f50 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -500,15 +500,15 @@ class RandCscMat { using ValViewTypeD = Kokkos::View; using RowIdViewTypeD = Kokkos::View; using ColMapViewTypeD = Kokkos::View; - using ValViewTypeH = Kokkos::View; - using RowIdViewTypeH = Kokkos::View; - using ColMapViewTypeH = Kokkos::View; int64_t __nrows; int64_t __ncols; int64_t __nnz = 0; ColMapViewTypeD __col_map_d; RowIdViewTypeD __row_ids_d; ValViewTypeD __vals_d; + using ColMapViewTypeH = typename ColMapViewTypeD::HostMirror; + using RowIdViewTypeH = typename RowIdViewTypeD::HostMirror; + using ValViewTypeH = typename ValViewTypeD::HostMirror; ColMapViewTypeH __col_map; RowIdViewTypeH __row_ids; ValViewTypeH __vals; From 15009d74bf9bb5c97431afdc23d0c5d40ba574e4 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 24 Mar 2022 13:55:38 -0600 Subject: [PATCH 069/261] Specify blas1 signatures for overload resolution --- docs/developer/apidocs/blas1.rst | 19 +++++++++++-------- docs/index.rst | 2 +- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/developer/apidocs/blas1.rst b/docs/developer/apidocs/blas1.rst index 3fddfc29c6..c6c65b1632 100644 --- a/docs/developer/apidocs/blas1.rst +++ b/docs/developer/apidocs/blas1.rst @@ -2,16 +2,19 @@ BLAS1 -- KokkosKernels blas1 interfaces ======================================= .. doxygenfunction:: KokkosBlas::axpby -.. doxygenfunction:: KokkosBlas::dot +.. doxygenfunction:: KokkosBlas::dot(const RV &, const XMV &, const YMV &, typename std::enable_if::value, int>::type = 0) +.. doxygenfunction:: KokkosBlas::dot(const XVector &, const YVector &) .. doxygenfunction:: KokkosBlas::fill .. doxygenfunction:: KokkosBlas::mult -.. doxygenfunction:: KokkosBlas::nrm1 -.. doxygenfunction:: KokkosBlas::nrm2 -.. doxygenfunction:: KokkosBlas::nrm2 -.. doxygenfunction:: KokkosBlas::nrm2w -.. doxygenfunction:: KokkosBlas::nrm2w -.. doxygenfunction:: KokkosBlas::nrminf +.. doxygenfunction:: KokkosBlas::nrm1(const RV &, const XMV &, typename std::enable_if::value, int>::type = 0) +.. doxygenfunction:: KokkosBlas::nrm1(const XVector &) +.. doxygenfunction:: KokkosBlas::nrm2(const RV &R, const XMV &X, typename std::enable_if::value, int>::type = 0) +.. doxygenfunction:: KokkosBlas::nrm2(const XVector &x) +.. doxygenfunction:: KokkosBlas::nrm2w(const RV &R, const XMV &X, const XMV &W, typename std::enable_if::value, int>::type = 0) +.. doxygenfunction:: KokkosBlas::nrm2w(const XVector &x, const XVector &w) +.. doxygenfunction:: KokkosBlas::nrminf(const RV &R, const XMV &X, typename std::enable_if::value, int>::type = 0) +.. doxygenfunction:: KokkosBlas::nrminf(const XVector &x) .. doxygenfunction:: KokkosBlas::reciprocal .. doxygenfunction:: KokkosBlas::scal -.. doxygenfunction:: KokkosBlas::sum +.. doxygenfunction:: KokkosBlas::sum(const RV &R, const XMV &X, typename std::enable_if::value, int>::type = 0) .. doxygenfunction:: KokkosBlas::update diff --git a/docs/index.rst b/docs/index.rst index f5dded3aad..e0c5ea9a98 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -5,4 +5,4 @@ Kokkos Kernels documentation KokkosKernels GitHub Homepage User Manual - Developer API Docs + Developer Docs From 211a1bbf9bd7dfe07361e671b77157998ff7b8e5 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 24 Mar 2022 14:46:36 -0600 Subject: [PATCH 070/261] Fix sign-compare warning in SPMV perf test --- perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp index c32968c177..3a631fc743 100644 --- a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp +++ b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp @@ -143,7 +143,7 @@ void kk_inspector_matvec(AType A, XType x, YType y, int team_size, workset_offsets(0) = 0; lno_t ws = 1; for (lno_t row = 0; row < A.numRows(); row++) { - if (A.graph.row_map(row) > ws * nnz_per_workset) { + if (A.graph.row_map(row) > size_type(ws) * nnz_per_workset) { workset_offsets(ws) = row; ws++; } From 0035a601e1354546b37aff9f1ae8e1df4cf9c5df Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 24 Mar 2022 15:27:28 -0600 Subject: [PATCH 071/261] Organize API docs --- docs/developer/apidocs/batched_dense.rst | 254 +++++++++++++++++++++- docs/developer/apidocs/batched_sparse.rst | 40 +++- docs/developer/apidocs/blas1.rst | 35 +++ docs/developer/apidocs/blas2.rst | 5 +- docs/developer/apidocs/blas3.rst | 6 +- docs/developer/apidocs/sparse.rst | 20 +- docs/developer/contrib.rst | 2 +- docs/developer/index.rst | 2 +- 8 files changed, 355 insertions(+), 9 deletions(-) diff --git a/docs/developer/apidocs/batched_dense.rst b/docs/developer/apidocs/batched_dense.rst index cc4040bb80..1d65842061 100644 --- a/docs/developer/apidocs/batched_dense.rst +++ b/docs/developer/apidocs/batched_dense.rst @@ -1,9 +1,257 @@ BATCHED -- KokkosKernels batched functor-level interfaces ========================================================= -.. doxygenclass:: KokkosBatched::SerialAxpby +innerlu +------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerLU_Internal.hpp + +applypivot +---------- +.. doxygenstruct:: KokkosBatched::TeamVectorApplyPivot + :members: + +qr_withcolumnpivoting +--------------------- +.. doxygenstruct:: KokkosBatched::TeamVectorQR_WithColumnPivoting + :members: + +addradial +--------- +.. doxygenstruct:: KokkosBatched::SerialAddRadial + :members: +.. doxygenstruct:: KokkosBatched::TeamAddRadial + :members: + +householder +----------- +.. doxygenstruct:: KokkosBatched::SerialHouseholder + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorHouseholder + :members: + +set +--- +.. doxygenstruct:: KokkosBatched::SerialSet + :members: +.. doxygenstruct:: KokkosBatched::TeamSet + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorSet + :members: + +scale +----- +.. doxygenstruct:: KokkosBatched::SerialScale + :members: +.. doxygenstruct:: KokkosBatched::TeamScale + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorScale + :members: + +setidentity +----------- +.. doxygenstruct:: KokkosBatched::SerialSetIdentity + :members: +.. doxygenstruct:: KokkosBatched::TeamSetIdentity + :members: +.. doxygenstruct:: KokkosBatched::SetIdentity + :members: + +applyhouseholder +---------------- +.. doxygenstruct:: KokkosBatched::SerialApplyHouseholder + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorApplyHouseholder + :members: + +innermultipledotproduct +----------------------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerMultipleDotProduct_Internal.hpp + +lu +-- +.. doxygenstruct:: KokkosBatched::SerialLU + :members: +.. doxygenstruct:: KokkosBatched::TeamLU + :members: +.. doxygenstruct:: KokkosBatched::LU + :members: + +solveutv +-------- +.. doxygenstruct:: KokkosBatched::TeamVectorSolveUTV + :members: + +utv +--- +.. doxygenstruct:: KokkosBatched::TeamVectorUTV + :members: + +inverselu +--------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InverseLU_Internal.hpp + +svd +--- +.. doxygenstruct:: KokkosBatched::SerialSVD + :members: + +eigendecomposition +------------------ +.. doxygenstruct:: KokkosBatched::SerialEigendecomposition + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorEigendecomposition + :members: + +trtri +----- +.. doxygenstruct:: KokkosBatched::SerialTrtri + :members: + +qr +-- +.. doxygenstruct:: KokkosBatched::SerialQR + :members: +.. doxygenstruct:: KokkosBatched::TeamQR + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorQR + :members: +.. doxygenstruct:: KokkosBatched::QR + :members: + +trmm +---- +.. doxygenstruct:: KokkosBatched::SerialTrmm + :members: + +trsm +---- +.. doxygenstruct:: KokkosBatched::SerialTrsm + :members: +.. doxygenstruct:: KokkosBatched::TeamTrsm + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorTrsm + :members: +.. doxygenstruct:: KokkosBatched::Trsm + :members: + +innergemmfixa +------------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerGemmFixA_Internal.hpp + +innergemmfixb +------------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerGemmFixB_Internal.hpp + +innergemmfixc +------------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerGemmFixC_Internal.hpp + +applyq +------ +.. doxygenstruct:: KokkosBatched::SerialApplyQ + :members: +.. doxygenstruct:: KokkosBatched::TeamApplyQ + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorApplyQ + :members: +.. doxygenstruct:: KokkosBatched::ApplyQ + :members: + +copy +---- +.. doxygenstruct:: KokkosBatched::SerialCopy + :members: +.. doxygenstruct:: KokkosBatched::TeamCopy + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorCopy + :members: +.. doxygenstruct:: KokkosBatched::Copy + :members: + +innertrsm +--------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerTrsm_Internal.hpp + +solvelu +------- +.. doxygenstruct:: KokkosBatched::SerialSolveLU + :members: +.. doxygenstruct:: KokkosBatched::TeamSolveLU + :members: +.. doxygenstruct:: KokkosBatched::SolveLU + :members: + +xpay +---- +.. doxygenstruct:: KokkosBatched::SerialXpay + :members: +.. doxygenstruct:: KokkosBatched::TeamXpay + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorXpay + :members: + +axpy +---- +.. doxygenstruct:: KokkosBatched::SerialAxpy + :members: +.. doxygenstruct:: KokkosBatched::TeamAxpy + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorAxpy + :members: + +gemv +---- +.. doxygenstruct:: KokkosBatched::SerialGemv + :members: +.. doxygenstruct:: KokkosBatched::TeamGemv + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorGemv + :members: +.. doxygenstruct:: KokkosBatched::Gemv + :members: + +dot +--- +.. doxygenstruct:: KokkosBatched::SerialDot + :members: +.. doxygenstruct:: KokkosBatched::TeamDot + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorDot + :members: + +hadamardproduct +--------------- +.. doxygenstruct:: KokkosBatched::SerialHadamardProduct + :members: +.. doxygenstruct:: KokkosBatched::TeamHadamardProduct + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorHadamardProduct + :members: +.. doxygenstruct:: KokkosBatched::HadamardProduct + :members: + +vector +------ +CodeCleanup-TODO: Move Decl file to dense/impl/ + +trsv +---- +.. doxygenstruct:: KokkosBatched::SerialTrsv + :members: +.. doxygenstruct:: KokkosBatched::TeamTrsv + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorTrsv + :members: +.. doxygenstruct:: KokkosBatched::Trsv + :members: + +gemm +---- +.. doxygenstruct:: KokkosBatched::SerialGemm :members: -.. doxygenclass:: KokkosBatched::TeamAxpby +.. doxygenstruct:: KokkosBatched::TeamGemm :members: -.. doxygenclass:: KokkosBatched::TeamVectorAxpby +.. doxygenstruct:: KokkosBatched::TeamVectorGemm :members: +.. doxygenstruct:: KokkosBatched::Gemm + :members: \ No newline at end of file diff --git a/docs/developer/apidocs/batched_sparse.rst b/docs/developer/apidocs/batched_sparse.rst index 38592c90fd..48031bc550 100644 --- a/docs/developer/apidocs/batched_sparse.rst +++ b/docs/developer/apidocs/batched_sparse.rst @@ -1,5 +1,43 @@ SPARSE BATCHED -- KokkosKernels sparse batched functor-level interfaces ======================================================================= -.. doxygenclass:: KokkosBatched::CG +cg +-- +.. doxygenstruct:: KokkosBatched::CG + :members: + +crsmatrix +--------- +.. doxygenclass:: KokkosBatched::CrsMatrix + :members: + +gmres +----- +.. doxygenstruct:: KokkosBatched::GMRES + :members: + +identity +-------- +.. doxygenclass:: KokkosBatched::Identity + :members: + +jacobiprec +---------- +.. doxygenclass:: KokkosBatched::JacobiPrec + :members: + +krylovhandle +------------ +.. doxygenclass:: KokkosBatched::KrylovHandle + :members: + +spmv +---- +.. doxygenstruct:: KokkosBatched::SerialSpmv + :members: +.. doxygenstruct:: KokkosBatched::TeamSpmv + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorSpmv + :members: +.. doxygenstruct:: KokkosBatched::Spmv :members: \ No newline at end of file diff --git a/docs/developer/apidocs/blas1.rst b/docs/developer/apidocs/blas1.rst index c6c65b1632..bfeb7fd1bb 100644 --- a/docs/developer/apidocs/blas1.rst +++ b/docs/developer/apidocs/blas1.rst @@ -1,20 +1,55 @@ BLAS1 -- KokkosKernels blas1 interfaces ======================================= +axpby +----- .. doxygenfunction:: KokkosBlas::axpby + +dot +--- .. doxygenfunction:: KokkosBlas::dot(const RV &, const XMV &, const YMV &, typename std::enable_if::value, int>::type = 0) .. doxygenfunction:: KokkosBlas::dot(const XVector &, const YVector &) + +fill +---- .. doxygenfunction:: KokkosBlas::fill + +mult +---- .. doxygenfunction:: KokkosBlas::mult + +nrm1 +---- .. doxygenfunction:: KokkosBlas::nrm1(const RV &, const XMV &, typename std::enable_if::value, int>::type = 0) .. doxygenfunction:: KokkosBlas::nrm1(const XVector &) + +nrm2 +---- .. doxygenfunction:: KokkosBlas::nrm2(const RV &R, const XMV &X, typename std::enable_if::value, int>::type = 0) .. doxygenfunction:: KokkosBlas::nrm2(const XVector &x) + +nrm2w +----- .. doxygenfunction:: KokkosBlas::nrm2w(const RV &R, const XMV &X, const XMV &W, typename std::enable_if::value, int>::type = 0) .. doxygenfunction:: KokkosBlas::nrm2w(const XVector &x, const XVector &w) + +nrminf +------ .. doxygenfunction:: KokkosBlas::nrminf(const RV &R, const XMV &X, typename std::enable_if::value, int>::type = 0) .. doxygenfunction:: KokkosBlas::nrminf(const XVector &x) + +reciprocal +---------- .. doxygenfunction:: KokkosBlas::reciprocal + +scal +---- .. doxygenfunction:: KokkosBlas::scal + +sum +--- .. doxygenfunction:: KokkosBlas::sum(const RV &R, const XMV &X, typename std::enable_if::value, int>::type = 0) + +update +------ .. doxygenfunction:: KokkosBlas::update diff --git a/docs/developer/apidocs/blas2.rst b/docs/developer/apidocs/blas2.rst index bd7f3c8d70..1d9a3f3fa7 100644 --- a/docs/developer/apidocs/blas2.rst +++ b/docs/developer/apidocs/blas2.rst @@ -1,4 +1,7 @@ BLAS2 -- KokkosKernels blas2 interfaces ======================================= -.. doxygenfunction:: KokkosBlas::gemv +gemv +---- +.. doxygenfunction:: KokkosBlas::gemv(const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) +.. doxygenfunction:: KokkosBlas::gemv(const typename AViewType::execution_space &space, const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) diff --git a/docs/developer/apidocs/blas3.rst b/docs/developer/apidocs/blas3.rst index 8dda488a74..810b28a5a3 100644 --- a/docs/developer/apidocs/blas3.rst +++ b/docs/developer/apidocs/blas3.rst @@ -1,4 +1,8 @@ BLAS3 -- KokkosKernels blas3 interfaces ======================================= -.. doxygenfunction:: KokkosBlas::gemm +gemm +---- +.. doxygenfunction:: KokkosBlas::gemm(const char transA, const char transB, AMat::const_value_type alpha, const AMat &a, const BMat &b, CMat::const_value_type beta, const CMat &c) +.. doxygenfunction:: KokkosBlas::gemm(const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C) +.. doxygenfunction:: KokkosBlas::gemm(const typename CViewType::execution_space &space, const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index e1bd74babb..84ec48a519 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -1,9 +1,27 @@ SPARSE -- KokkosKernels sparse interfaces ========================================= +crsmatrix +--------- .. doxygenclass:: KokkosSparse::CrsMatrix :members: -.. doxygenfunction:: KokkosSparse::spmv + +spmv +---- +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls, const char[], const AlphaType&, const AMatrix&, const XVector&, const BetaType&, const YVector&) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_ONE) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_TWO) +.. doxygenfunction:: KokkosSparse::spmv(const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) + +trsv +---- .. doxygenfunction:: KokkosSparse::trsv + +spgemm +------ .. doxygenfunction:: KokkosSparse::spgemm + +gauss +----- .. doxygenfunction:: KokkosSparse::gauss diff --git a/docs/developer/contrib.rst b/docs/developer/contrib.rst index 495e8cbd94..0b02ebf190 100644 --- a/docs/developer/contrib.rst +++ b/docs/developer/contrib.rst @@ -4,7 +4,7 @@ Contributing Comment Style ------------- We follow doxygen style comments for both external (API) and internal members. See https://www.doxygen.nl/manual/docblocks.html for details. -Our documentation can be generated using the `-DKokkosKernels_ENABLE_DOCS:BOOL=ON` cmake flag. +Our documentation can be generated using the `-DKokkosKernels_ENABLE_DOCS:BOOL=ON` cmake flag; see `Building the Documentation`. In general, we prefer that the prototype has the doxygen style comment rather than the definition. If there is no prototype, then the definition should have the doxygen style comment. diff --git a/docs/developer/index.rst b/docs/developer/index.rst index d45eb38474..7ee05f98ae 100644 --- a/docs/developer/index.rst +++ b/docs/developer/index.rst @@ -2,7 +2,7 @@ Developer Manual ================ .. toctree:: - :maxdepth: 2 + :maxdepth: 1 Source Code Documentation Building the Documentation From 6423bf011072042ba6c844f2f7caa412030365e2 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 24 Mar 2022 15:51:27 -0600 Subject: [PATCH 072/261] Add build_doc.rst --- docs/developer/build_doc.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 docs/developer/build_doc.rst diff --git a/docs/developer/build_doc.rst b/docs/developer/build_doc.rst new file mode 100644 index 0000000000..dd3d357286 --- /dev/null +++ b/docs/developer/build_doc.rst @@ -0,0 +1,18 @@ +Building Developer Documentation +================================ + +.. code-block:: + :caption: Installing dependencies on MacOS + + brew install doxygen + pip install sphinx + pip install breathe + pip install sphinx-rtd-theme + +.. code-block:: + :caption: How to build developer documentation + + cmake -DKokkosKernels_ENABLE_DOCS:BOOL=ON /path/to/kokkos-kernels + make Doxygen + make Sphinx + open build/docs/docs/sphinx/index.html \ No newline at end of file From 6cf17d027057ab04f15f3a3cb354c56cbe18e46b Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 25 Mar 2022 10:31:07 -0600 Subject: [PATCH 073/261] Minor updates to cluster Gauss-Seidel - Add necessary fences before timing blocks - Remove unnecessary fences in apply - Change inner apply loop to only access cluster begin/end once --- ...KokkosSparse_cluster_gauss_seidel_impl.hpp | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index 60a00bd36a..bb95eea101 100644 --- a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -337,9 +337,13 @@ class ClusterGaussSeidel { (teamMember.league_rank() * _clusters_per_team) + work; if (ii >= _color_set_end) return; - nnz_lno_t cluster = _color_adj(ii); - for (nnz_lno_t j = _cluster_offsets(cluster); - j < _cluster_offsets(cluster + 1); j++) { + nnz_lno_t cluster = _color_adj(ii); + nnz_lno_t clusterBegin = _cluster_offsets(cluster); + nnz_lno_t clusterEnd = _cluster_offsets(cluster + 1); + for (nnz_lno_t jcount = 0; jcount < clusterEnd - clusterBegin; + jcount++) { + nnz_lno_t j = _is_backward ? (clusterEnd - 1 - jcount) + : clusterBegin + jcount; nnz_lno_t row = _cluster_verts(j); nnz_lno_t num_vecs = _Xvector.extent(1); for (nnz_lno_t batch_start = 0; batch_start < num_vecs;) { @@ -352,14 +356,10 @@ class ClusterGaussSeidel { COL_BATCH_CASE(1) COL_BATCH_CASE(2) COL_BATCH_CASE(3) - COL_BATCH_CASE(4) - COL_BATCH_CASE(5) - COL_BATCH_CASE(6) - COL_BATCH_CASE(7) #undef COL_BATCH_CASE default: - runColBatch<8>(teamMember, row, batch_start); - batch_start += 8; + runColBatch<4>(teamMember, row, batch_start); + batch_start += 4; } } } @@ -561,6 +561,7 @@ class ClusterGaussSeidel { in_rowmap_t, in_colinds_t, rowmap_t, colinds_t, MyExecSpace>( num_rows, this->row_map, this->entries, sym_xadj, sym_adj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "SYMMETRIZING TIME: " << timer.seconds() << std::endl; timer.reset(); #endif @@ -607,6 +608,7 @@ class ClusterGaussSeidel { " is not implemented"); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "Graph clustering: " << timer.seconds() << '\n'; timer.reset(); #endif @@ -620,6 +622,7 @@ class ClusterGaussSeidel { raw_sym_xadj, raw_sym_adj, vertClusters, numClusters, clusterRowmap, clusterEntries, clusterOffsets, clusterVerts, false); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "Building explicit cluster graph: " << timer.seconds() << '\n'; timer.reset(); #endif @@ -668,6 +671,7 @@ class ClusterGaussSeidel { kh.destroy_graph_coloring_handle(); #endif #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "Coloring: " << timer.seconds() << '\n'; timer.reset(); #endif @@ -677,8 +681,8 @@ class ClusterGaussSeidel { typename HandleType::GraphColoringHandleType::color_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( numClusters, numColors, colors, color_xadj, color_adj); - MyExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "CREATE_REVERSE_MAP:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -798,8 +802,8 @@ class ClusterGaussSeidel { } gsHandle->set_inverse_diagonal(inverse_diagonal); gsHandle->set_call_numeric(true); - MyExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "NUMERIC:" << timer.seconds() << std::endl; #endif } @@ -861,7 +865,6 @@ class ClusterGaussSeidel { this->IterativePSGS(gs, numColors, h_color_xadj, numIter, apply_forward, apply_backward); } - MyExecSpace().fence(); } template @@ -894,7 +897,6 @@ class ClusterGaussSeidel { gs._clusters_per_team, team_size, vec_size), gs); - MyExecSpace().fence(); } } if (apply_backward) { @@ -913,7 +915,6 @@ class ClusterGaussSeidel { gs._clusters_per_team, team_size, vec_size), gs); - MyExecSpace().fence(); if (i == 0) { break; } @@ -945,7 +946,6 @@ class ClusterGaussSeidel { Kokkos::RangePolicy( 0, color_index_end - color_index_begin), gs); - MyExecSpace().fence(); } } if (apply_backward && numColors) { @@ -958,7 +958,6 @@ class ClusterGaussSeidel { Kokkos::RangePolicy( 0, color_index_end - color_index_begin), gs); - MyExecSpace().fence(); if (i == 0) { break; } From abfc89ab7fbf0d23848df9d564b92b3aeb974276 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 28 Mar 2022 09:52:19 -0600 Subject: [PATCH 074/261] sparse: Remove csc2csr copy by reference. --- src/sparse/KokkosSparse_csc2csr.hpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/sparse/KokkosSparse_csc2csr.hpp b/src/sparse/KokkosSparse_csc2csr.hpp index 90cee9b51c..5b85671587 100644 --- a/src/sparse/KokkosSparse_csc2csr.hpp +++ b/src/sparse/KokkosSparse_csc2csr.hpp @@ -132,21 +132,21 @@ class Csc2Csr { OrdinalType __nrows; OrdinalType __ncols; SizeType __nnz; - ValViewType &__vals; - CrsValsViewType &__crs_vals; - RowIdViewType &__row_ids; - CrsRowMapViewType &__crs_row_map; - CrsRowMapViewType &__crs_row_map_scratch; - ColMapViewType &__col_map; - CrsColIdViewType &__crs_col_ids; - RowIdViewType &__crs_row_cnt; + ValViewType __vals; + CrsValsViewType __crs_vals; + RowIdViewType __row_ids; + CrsRowMapViewType __crs_row_map; + CrsRowMapViewType __crs_row_map_scratch; + ColMapViewType __col_map; + CrsColIdViewType __crs_col_ids; + RowIdViewType __crs_row_cnt; public: __Functor(OrdinalType nrows, OrdinalType ncols, SizeType nnz, - ValViewType &vals, CrsValsViewType &crs_vals, - RowIdViewType &row_ids, CrsRowMapViewType &crs_row_map, - CrsRowMapViewType &crs_row_map_scratch, ColMapViewType &col_map, - CrsColIdViewType &crs_col_ids, RowIdViewType &crs_row_cnt) + ValViewType vals, CrsValsViewType crs_vals, RowIdViewType row_ids, + CrsRowMapViewType crs_row_map, + CrsRowMapViewType crs_row_map_scratch, ColMapViewType col_map, + CrsColIdViewType crs_col_ids, RowIdViewType crs_row_cnt) : __nrows(nrows), __ncols(ncols), __nnz(nnz), From 5521edbba44733afab1b1f0d257497575dd9f75f Mon Sep 17 00:00:00 2001 From: Jennifer Loe Date: Fri, 1 Apr 2022 16:22:09 -0600 Subject: [PATCH 075/261] Fixes code deprecation warnings. --- src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index ebd6ce8993..4c09b8bf4e 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -136,13 +136,13 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, size_t bufferSize = 0; void* dBuffer = NULL; - cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; + cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; if (controls.isParameter("algorithm")) { const std::string algName = controls.getParameter("algorithm"); if (algName == "default") - alg = CUSPARSE_MV_ALG_DEFAULT; + alg = CUSPARSE_SPMV_ALG_DEFAULT; else if (algName == "merge") - alg = CUSPARSE_CSRMV_ALG2; + alg = CUSPARSE_SPMV_CSR_ALG2; } KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize( cusparseHandle, myCusparseOperation, &alpha, A_cusparse, vecX, &beta, From 2fa4766a08c8a91ff7f23d0da7b038f700ca0379 Mon Sep 17 00:00:00 2001 From: Jennifer Loe Date: Fri, 1 Apr 2022 16:45:59 -0600 Subject: [PATCH 076/261] Fixed one more. --- perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp index ca16f2067e..20a0c7429f 100644 --- a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp @@ -605,7 +605,7 @@ int main(int argc, char** argv) { const double alpha = 1.0, beta = 1.0; size_t bufferSize = 0; void* dBuffer = NULL; - cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; + cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize( controls.getCusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A_cusparse, vecX, &beta, vecY, myCudaDataType, alg, From a867e5c8dccddc4745091bb7b1e5396ebe2ae20f Mon Sep 17 00:00:00 2001 From: Jennifer Loe Date: Wed, 6 Apr 2022 11:33:03 -0600 Subject: [PATCH 077/261] Fixed for different CuSparse versions. --- perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp | 4 ++++ src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 10 +++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp index 20a0c7429f..92924e7b5c 100644 --- a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp @@ -605,7 +605,11 @@ int main(int argc, char** argv) { const double alpha = 1.0, beta = 1.0; size_t bufferSize = 0; void* dBuffer = NULL; +#if CUSPARSE_VERSION >= 11201 cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; +#else + cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; +#endif KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize( controls.getCusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A_cusparse, vecX, &beta, vecY, myCudaDataType, alg, diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 4c09b8bf4e..fc3573d910 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -136,11 +136,19 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, size_t bufferSize = 0; void* dBuffer = NULL; +#if CUSPARSE_VERSION >= 11201 cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; +#else + cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; +#endif if (controls.isParameter("algorithm")) { const std::string algName = controls.getParameter("algorithm"); if (algName == "default") - alg = CUSPARSE_SPMV_ALG_DEFAULT; +#if CUSPARSE_VERSION >= 11201 + cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; +#else + cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; +#endif else if (algName == "merge") alg = CUSPARSE_SPMV_CSR_ALG2; } From d701ac1665035743a98ac5d8fb3a9616d4e8cd9d Mon Sep 17 00:00:00 2001 From: Jennifer Loe Date: Wed, 6 Apr 2022 13:11:37 -0600 Subject: [PATCH 078/261] Formatting changes. --- perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp | 6 +++--- src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp index 92924e7b5c..c578c269f8 100644 --- a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp @@ -603,11 +603,11 @@ int main(int argc, char** argv) { &vecY, y1.extent_int(0), (void*)y1.data(), myCudaDataType)); const double alpha = 1.0, beta = 1.0; - size_t bufferSize = 0; - void* dBuffer = NULL; + size_t bufferSize = 0; + void* dBuffer = NULL; #if CUSPARSE_VERSION >= 11201 cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; -#else +#else cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; #endif KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize( diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index fc3573d910..f43f36fa18 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -134,11 +134,11 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec( &vecY, y.extent_int(0), (void*)y.data(), myCudaDataType)); - size_t bufferSize = 0; - void* dBuffer = NULL; + size_t bufferSize = 0; + void* dBuffer = NULL; #if CUSPARSE_VERSION >= 11201 cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; -#else +#else cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; #endif if (controls.isParameter("algorithm")) { @@ -146,7 +146,7 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, if (algName == "default") #if CUSPARSE_VERSION >= 11201 cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; -#else +#else cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; #endif else if (algName == "merge") From cb301341aeb052995014af0e06d26adb6219dd02 Mon Sep 17 00:00:00 2001 From: Jennifer Loe Date: Wed, 6 Apr 2022 13:36:51 -0600 Subject: [PATCH 079/261] Final fixes to SpMV macro for CuSparse. --- src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index f43f36fa18..d6f36c0a2b 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -145,12 +145,16 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, const std::string algName = controls.getParameter("algorithm"); if (algName == "default") #if CUSPARSE_VERSION >= 11201 - cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; + alg = CUSPARSE_SPMV_ALG_DEFAULT; #else - cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; + alg = CUSPARSE_MV_ALG_DEFAULT; #endif else if (algName == "merge") +#if CUSPARSE_VERSION >= 11201 alg = CUSPARSE_SPMV_CSR_ALG2; +#else + alg = CUSPARSE_CSRMV_ALG2; +#endif } KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize( cusparseHandle, myCusparseOperation, &alpha, A_cusparse, vecX, &beta, From e8dd918e89d2f063255fa126c827965260cd5ebd Mon Sep 17 00:00:00 2001 From: Jennifer Loe Date: Thu, 7 Apr 2022 10:48:48 -0600 Subject: [PATCH 080/261] Fix unused parameter warnings. --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index b9cff5e5e4..2d87567c6f 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1775,6 +1775,11 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, Kokkos::deep_copy(dst, h_dst); Kokkos::fence(); #else + // Avoid unused parameter warnings: + (void)src; + (void)dst; + (void)options; + Kokkos::abort( "Cannot perform simd verification with cuda/10.2.2, rerun with -v 0"); #endif // #if (CUDA_VERSION != 10020) From 99f91e48ea73e90e74cd50db10437ccca4bd4b61 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Wed, 13 Apr 2022 09:52:38 -0400 Subject: [PATCH 081/261] Value-initialize result of MaxLoc reduction to avoid maybe uninitialized warning --- src/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp b/src/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp index f11210253e..32980219bf 100644 --- a/src/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp @@ -43,7 +43,7 @@ struct TeamVectorFindAmaxInternal { if (m > 0) { using reducer_value_type = typename Kokkos::MaxLoc::value_type; - reducer_value_type value; + reducer_value_type value{}; Kokkos::MaxLoc reducer_value(value); Kokkos::parallel_reduce( Kokkos::TeamVectorRange(member, m), From 2221b2c184fd3d97c48dae23df9c86ff06593537 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 1 Apr 2022 17:09:42 -0600 Subject: [PATCH 082/261] sptrsv: improve symbolic level scheduling time Use level scheduling implementation like that from spiluk Co-author: Vinh Dang @vqd8a --- .../KokkosSparse_sptrsv_symbolic_impl.hpp | 157 ++++++------------ 1 file changed, 47 insertions(+), 110 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 4d11112493..1d4be5be08 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -223,65 +223,32 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); Kokkos::deep_copy(level_list, dlevel_list); - HostSignedEntriesType previous_level_list( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "previous_level_list"), - nrows); - Kokkos::deep_copy(previous_level_list, signed_integral_t(-1)); - - const bool stored_diagonal = thandle.is_stored_diagonal(); - // diagonal_offsets is uninitialized - deep_copy unnecessary at the - // beginning, only needed at the end - auto diagonal_offsets = thandle.get_diagonal_offsets(); - auto hdiagonal_offsets = thandle.get_host_diagonal_offsets(); - - size_type level = 0; - auto starting_node = 0; - auto ending_node = nrows; - + signed_integral_t level = 0; size_type node_count = 0; - while (node_count < nrows) { - for (size_type row = starting_node; row < ending_node; ++row) { - if (level_list(row) == -1) { // unmarked - bool is_root = true; - signed_integral_t ptrstart = row_map(row); - signed_integral_t ptrend = row_map(row + 1); - - for (signed_integral_t offset = ptrstart; offset < ptrend; ++offset) { - size_type col = entries(offset); - if (previous_level_list(col) == -1 && col != row) { // unmarked - if (col < row) { - is_root = false; - break; - } - } else if (col == row) { - if (stored_diagonal) hdiagonal_offsets(row) = offset; - } else if (col > row) { - std::cout << "\nrow = " << row << " col = " << col - << " offset = " << offset << std::endl; - throw( - std::runtime_error("SYMB ERROR: Lower tri with colid > rowid " - "- SHOULD NOT HAPPEN!!!")); - } - } // end for offset , i.e. cols of this row - - if (is_root == true) { - level_list(row) = level; - nodes_per_level(level) += 1; - nodes_grouped_by_level(node_count) = row; - node_count += 1; - } - - } // end if - } // end for row - - // Kokkos::deep_copy(previous_level_list, level_list); - for (size_type i = 0; i < nrows; ++i) { - previous_level_list(i) = level_list(i); + typename DeviceEntriesType::HostMirror level_ptr("lp", nrows+1); // temp View used for index bookkeeping + level_ptr(0) = 0; + for (size_type i = 0; i < nrows; ++i) { + signed_integral_t l = 0; + size_type rowstart = row_map(i); + size_type rowend = row_map(i + 1); + for (size_type j = rowstart; j < rowend; j++) { + size_type col = entries(j); + l = std::max(l, level_list(col)); } - - level += 1; - } // end while + level_list(i) = l + 1; + nodes_per_level(l) += 1; // 0-based indexing + level_ptr(l + 1) += 1; + level = std::max(level, l + 1); + node_count++; + } + for (size_type i = 1; i <= level; ++i) { + level_ptr(i) += level_ptr(i - 1); + } + for (size_type i = 0; i < nrows; i++) { + nodes_grouped_by_level(level_ptr(level_list(i) - 1)) = i; + level_ptr(level_list(i) - 1) += 1; + } thandle.set_num_levels(level); @@ -320,7 +287,6 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); Kokkos::deep_copy(dnodes_per_level, nodes_per_level); Kokkos::deep_copy(dlevel_list, level_list); - if (stored_diagonal) Kokkos::deep_copy(diagonal_offsets, hdiagonal_offsets); // Extra check: #ifdef LVL_OUTPUT_INFO @@ -705,61 +671,33 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); Kokkos::deep_copy(level_list, dlevel_list); - HostSignedEntriesType previous_level_list( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "previous_level_list"), - nrows); - Kokkos::deep_copy(previous_level_list, signed_integral_t(-1)); - - const bool stored_diagonal = thandle.is_stored_diagonal(); - // diagonal_offsets is uninitialized - deep_copy unnecessary at the - // beginning, only needed at the end - auto diagonal_offsets = thandle.get_diagonal_offsets(); - auto hdiagonal_offsets = thandle.get_host_diagonal_offsets(); - - size_type level = 0; - auto starting_node = nrows - 1; - auto ending_node = 0; - + signed_integral_t level = 0; size_type node_count = 0; - while (node_count < nrows) { - for (signed_integral_t row = starting_node; row >= ending_node; --row) { - if (level_list(row) == -1) { // unmarked - bool is_root = true; - signed_integral_t ptrstart = row_map(row); - signed_integral_t ptrend = row_map(row + 1); - - for (signed_integral_t offset = ptrend - 1; offset >= ptrstart; - --offset) { - signed_integral_t col = entries(offset); - - if (previous_level_list(col) == -1 && col != row) { // unmarked - if (col > row) { - is_root = false; - break; - } - } else if (col == row) { - if (stored_diagonal) hdiagonal_offsets(row) = offset; - } - } // end for offset , i.e. cols of this row - - if (is_root == true) { - level_list(row) = level; - nodes_per_level(level) += 1; - nodes_grouped_by_level(node_count) = row; - node_count += 1; - } - - } // end if - } // end for row - - // Kokkos::deep_copy(previous_level_list, level_list); - for (size_type i = 0; i < nrows; ++i) { - previous_level_list(i) = level_list(i); + typename DeviceEntriesType::HostMirror level_ptr("lp", nrows+1); // temp View used for index bookkeeping + level_ptr(0) = 0; + for (size_type ii = nrows; ii > 0 ; ii--) { + size_type i = ii-1; // Avoid >= 0 comparison in for-loop to prevent wraparound errors with unsigned types + signed_integral_t l = 0; + size_type rowstart = row_map(i)+1; // skip diag + size_type rowend = row_map(i + 1); + for (size_type j = rowstart; j < rowend; ++j) { + size_type col = entries(j); + l = std::max(l, level_list(col)); } - - level += 1; - } // end while + level_list(i) = l + 1; + nodes_per_level(l) += 1; // 0-based indexing + level_ptr(l + 1) += 1; + level = std::max(level, l + 1); + node_count++; + } + for (size_type i = 1; i <= level; ++i) { + level_ptr(i) += level_ptr(i - 1); + } + for (size_type i = 0; i < nrows; i++) { + nodes_grouped_by_level(level_ptr(level_list(i) - 1)) = i; + level_ptr(level_list(i) - 1) += 1; + } thandle.set_num_levels(level); @@ -798,7 +736,6 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); Kokkos::deep_copy(dnodes_per_level, nodes_per_level); Kokkos::deep_copy(dlevel_list, level_list); - if (stored_diagonal) Kokkos::deep_copy(diagonal_offsets, hdiagonal_offsets); // Extra check: #ifdef LVL_OUTPUT_INFO From e0a391441a7a8c4e7a932892e222771fddd56f1f Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 1 Apr 2022 17:13:04 -0600 Subject: [PATCH 083/261] Apply clang-format-8 --- .../KokkosSparse_sptrsv_symbolic_impl.hpp | 39 ++++++++++--------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 1d4be5be08..ba339d26a8 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -223,21 +223,22 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); Kokkos::deep_copy(level_list, dlevel_list); - signed_integral_t level = 0; - size_type node_count = 0; + signed_integral_t level = 0; + size_type node_count = 0; - typename DeviceEntriesType::HostMirror level_ptr("lp", nrows+1); // temp View used for index bookkeeping + typename DeviceEntriesType::HostMirror level_ptr( + "lp", nrows + 1); // temp View used for index bookkeeping level_ptr(0) = 0; for (size_type i = 0; i < nrows; ++i) { - signed_integral_t l = 0; - size_type rowstart = row_map(i); - size_type rowend = row_map(i + 1); + signed_integral_t l = 0; + size_type rowstart = row_map(i); + size_type rowend = row_map(i + 1); for (size_type j = rowstart; j < rowend; j++) { size_type col = entries(j); l = std::max(l, level_list(col)); } level_list(i) = l + 1; - nodes_per_level(l) += 1; // 0-based indexing + nodes_per_level(l) += 1; // 0-based indexing level_ptr(l + 1) += 1; level = std::max(level, l + 1); node_count++; @@ -288,7 +289,7 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, Kokkos::deep_copy(dnodes_per_level, nodes_per_level); Kokkos::deep_copy(dlevel_list, level_list); - // Extra check: + // Extra check: #ifdef LVL_OUTPUT_INFO { std::cout << " End symb - extra checks" << std::endl; @@ -671,22 +672,24 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); Kokkos::deep_copy(level_list, dlevel_list); - signed_integral_t level = 0; - size_type node_count = 0; + signed_integral_t level = 0; + size_type node_count = 0; - typename DeviceEntriesType::HostMirror level_ptr("lp", nrows+1); // temp View used for index bookkeeping + typename DeviceEntriesType::HostMirror level_ptr( + "lp", nrows + 1); // temp View used for index bookkeeping level_ptr(0) = 0; - for (size_type ii = nrows; ii > 0 ; ii--) { - size_type i = ii-1; // Avoid >= 0 comparison in for-loop to prevent wraparound errors with unsigned types - signed_integral_t l = 0; - size_type rowstart = row_map(i)+1; // skip diag - size_type rowend = row_map(i + 1); + for (size_type ii = nrows; ii > 0; ii--) { + size_type i = ii - 1; // Avoid >= 0 comparison in for-loop to prevent + // wraparound errors with unsigned types + signed_integral_t l = 0; + size_type rowstart = row_map(i) + 1; // skip diag + size_type rowend = row_map(i + 1); for (size_type j = rowstart; j < rowend; ++j) { size_type col = entries(j); l = std::max(l, level_list(col)); } level_list(i) = l + 1; - nodes_per_level(l) += 1; // 0-based indexing + nodes_per_level(l) += 1; // 0-based indexing level_ptr(l + 1) += 1; level = std::max(level, l + 1); node_count++; @@ -737,7 +740,7 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, Kokkos::deep_copy(dnodes_per_level, nodes_per_level); Kokkos::deep_copy(dlevel_list, level_list); - // Extra check: + // Extra check: #ifdef LVL_OUTPUT_INFO { std::cout << " End symb - extra checks" << std::endl; From 5a791be6cb22f5f4f388788fb90a610368880ccc Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 1 Apr 2022 19:27:47 -0600 Subject: [PATCH 084/261] Fix unsigned - signed comparison -Werror --- src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index ba339d26a8..3a6f988835 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -243,7 +243,7 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, level = std::max(level, l + 1); node_count++; } - for (size_type i = 1; i <= level; ++i) { + for (signed_integral_t i = 1; i <= level; ++i) { level_ptr(i) += level_ptr(i - 1); } for (size_type i = 0; i < nrows; i++) { @@ -694,7 +694,7 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, level = std::max(level, l + 1); node_count++; } - for (size_type i = 1; i <= level; ++i) { + for (signed_integral_t i = 1; i <= level; ++i) { level_ptr(i) += level_ptr(i - 1); } for (size_type i = 0; i < nrows; i++) { From f26957addc8e2b523442fe7d294b3f61597556b9 Mon Sep 17 00:00:00 2001 From: kliegeois Date: Thu, 7 Apr 2022 12:04:45 -0600 Subject: [PATCH 085/261] Add batched GESV --- example/CMakeLists.txt | 1 + example/static_pivoting/CMakeLists.txt | 7 + example/static_pivoting/example.cpp | 177 ++++++ example/static_pivoting/examples_helper.hpp | 155 +++++ src/batched/dense/KokkosBatched_Gesv.hpp | 139 +++++ .../dense/impl/KokkosBatched_Gesv_Impl.hpp | 548 ++++++++++++++++++ .../batched/dense/Test_Batched_Dense.hpp | 6 + .../batched/dense/Test_Batched_DenseUtils.hpp | 44 ++ .../batched/dense/Test_Batched_SerialGesv.hpp | 139 +++++ .../dense/Test_Batched_SerialGesv_Real.hpp | 11 + .../batched/dense/Test_Batched_TeamGesv.hpp | 149 +++++ .../dense/Test_Batched_TeamGesv_Real.hpp | 11 + .../dense/Test_Batched_TeamVectorGesv.hpp | 149 +++++ .../Test_Batched_TeamVectorGesv_Real.hpp | 11 + 14 files changed, 1547 insertions(+) create mode 100644 example/static_pivoting/CMakeLists.txt create mode 100644 example/static_pivoting/example.cpp create mode 100644 example/static_pivoting/examples_helper.hpp create mode 100644 src/batched/dense/KokkosBatched_Gesv.hpp create mode 100644 src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp create mode 100644 unit_test/batched/dense/Test_Batched_DenseUtils.hpp create mode 100644 unit_test/batched/dense/Test_Batched_SerialGesv.hpp create mode 100644 unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp create mode 100644 unit_test/batched/dense/Test_Batched_TeamGesv.hpp create mode 100644 unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp create mode 100644 unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp create mode 100644 unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index a0c8c1f564..9dd8d09749 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -7,3 +7,4 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../test_common) #ADD_SUBDIRECTORY(graph) ADD_SUBDIRECTORY(wiki) ADD_SUBDIRECTORY(gmres) +ADD_SUBDIRECTORY(static_pivoting) diff --git a/example/static_pivoting/CMakeLists.txt b/example/static_pivoting/CMakeLists.txt new file mode 100644 index 0000000000..3bfc7e8d95 --- /dev/null +++ b/example/static_pivoting/CMakeLists.txt @@ -0,0 +1,7 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +KOKKOSKERNELS_ADD_EXECUTABLE( + static_pivoting + SOURCES example.cpp + ) \ No newline at end of file diff --git a/example/static_pivoting/example.cpp b/example/static_pivoting/example.cpp new file mode 100644 index 0000000000..b703cb74ad --- /dev/null +++ b/example/static_pivoting/example.cpp @@ -0,0 +1,177 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#include + +#define KOKKOSKERNELS_DEBUG_LEVEL 0 + +#include "Kokkos_Core.hpp" +#include "Kokkos_Timer.hpp" +#include "Kokkos_Random.hpp" +#include "Kokkos_UnorderedMap.hpp" +#include "Kokkos_Sort.hpp" + +/// KokkosKernels headers +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" + +#include +#include +#include "examples_helper.hpp" +#include +#include +#include +#include +#include +#include +#include "KokkosBatched_Gesv.hpp" + +typedef Kokkos::DefaultExecutionSpace exec_space; + +template +struct Functor_TeamTestStaticPivoting { + const AViewType _A; + const XYViewType _X; + const XYViewType _Y; + + KOKKOS_INLINE_FUNCTION + Functor_TeamTestStaticPivoting(const AViewType &A, const XYViewType &X, + const XYViewType &Y) + : _A(A), _X(X), _Y(Y) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { + const int matrix_id = static_cast(member.league_rank()); + + auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto X = Kokkos::subview(_X, matrix_id, Kokkos::ALL); + auto Y = Kokkos::subview(_Y, matrix_id, Kokkos::ALL); + member.team_barrier(); + KokkosBatched::TeamGesv::invoke(member, A, X, Y); + member.team_barrier(); + } + + inline void run() { + std::string name("KokkosBatched::Test::StaticPivoting"); + Kokkos::TeamPolicy policy(_A.extent(0), Kokkos::AUTO(), + Kokkos::AUTO()); + + using MatrixViewType = + Kokkos::View; + + const int n = _A.extent(1); + size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4); + + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0)); + + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +struct Functor_SerialTestStaticPivoting { + const AViewType _A; + const AViewType _tmp; + const XYViewType _X; + const XYViewType _Y; + + KOKKOS_INLINE_FUNCTION + Functor_SerialTestStaticPivoting(const AViewType &A, const AViewType &tmp, + const XYViewType &X, const XYViewType &Y) + : _A(A), _tmp(tmp), _X(X), _Y(Y) {} + + KOKKOS_INLINE_FUNCTION void operator()(const int &matrix_id) const { + auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto tmp = Kokkos::subview(_tmp, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto X = Kokkos::subview(_X, matrix_id, Kokkos::ALL); + auto Y = Kokkos::subview(_Y, matrix_id, Kokkos::ALL); + KokkosBatched::SerialGesv::invoke(A, X, Y, tmp); + } + + inline void run() { + std::string name("KokkosBatched::Test::StaticPivoting"); + + const int N = _A.extent(0); + Kokkos::parallel_for(name.c_str(), N, *this); + } +}; + +int main(int /*argc*/, char ** /*argv[]*/) { + Kokkos::initialize(); + { + using layout = Kokkos::LayoutLeft; + + using AViewType = Kokkos::View; + using XYViewType = Kokkos::View; + + int N = 1; + int n = 10; + + AViewType A("A", N, n, n); + AViewType tmp("tmp", N, n, n + 4); + XYViewType X("X", N, n); + XYViewType Y("Y", N, n); + + create_saddle_point_matrices(A, Y); + + // The matrices are modified by the GESV so we have to copy them if we want + // to solve the same systems twice. + AViewType A2("A2", N, n, n); + XYViewType Y2("Y2", N, n); + Kokkos::deep_copy(A2, A); + Kokkos::deep_copy(Y2, Y); + + write3DArrayToMM("A.mm", A); + write2DArrayToMM("Y.mm", Y); + + Functor_SerialTestStaticPivoting(A, tmp, + X, Y) + .run(); + write2DArrayToMM("X_serial.mm", X); + Functor_TeamTestStaticPivoting(A2, X, Y2) + .run(); + write2DArrayToMM("X_team.mm", X); + } + Kokkos::finalize(); +} diff --git a/example/static_pivoting/examples_helper.hpp b/example/static_pivoting/examples_helper.hpp new file mode 100644 index 0000000000..c9b5963c55 --- /dev/null +++ b/example/static_pivoting/examples_helper.hpp @@ -0,0 +1,155 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER + +template +void write2DArrayToMM(std::string name, const XType x) { + std::ofstream myfile; + myfile.open(name); + + auto x_h = Kokkos::create_mirror_view(x); + + Kokkos::deep_copy(x_h, x); + + if (XType::Rank == 2) { + myfile << "%% MatrixMarket 2D Array\n%" << std::endl; + myfile << x_h.extent(0) << " " << x_h.extent(1) << std::endl; + + for (size_t i = 0; i < x_h.extent(0); ++i) { + for (size_t j = 0; j < x_h.extent(1); ++j) { + myfile << std::setprecision(15) << x_h(i, j) << " "; + } + myfile << std::endl; + } + + myfile.close(); + } +} + +template +void write3DArrayToMM(std::string name, const XType x) { + std::ofstream myfile; + myfile.open(name); + + auto x_h = Kokkos::create_mirror_view(x); + + Kokkos::deep_copy(x_h, x); + + if (XType::Rank == 3) { + myfile << "%% MatrixMarket 3D Array\n%" << std::endl; + myfile << x_h.extent(0) << " " << x_h.extent(1) << " " << x_h.extent(2) + << std::endl; + + for (size_t i = 0; i < x_h.extent(0); ++i) { + myfile << "Slice " << i << std::endl; + for (size_t j = 0; j < x_h.extent(1); ++j) { + for (size_t k = 0; k < x_h.extent(2); ++k) { + myfile << std::setprecision(15) << x_h(i, j, k) << " "; + } + myfile << std::endl; + } + } + + myfile.close(); + } +} + +template +void create_saddle_point_matrices(const MatrixViewType &A, + const VectorViewType &Y, const int n_2 = 4) { + Kokkos::Random_XorShift64_Pool< + typename MatrixViewType::device_type::execution_space> + random(13718); + const int N = A.extent(0); + const int n = A.extent(1); + const int n_1 = n - n_2; + + const int n_dim = n_2 - 1; + MatrixViewType xs("xs", N, n_1, n_dim); + VectorViewType ys("ys", N, n_1); + + Kokkos::fill_random( + xs, random, + Kokkos::reduction_identity::prod()); + Kokkos::fill_random( + ys, random, + Kokkos::reduction_identity::prod()); + + auto xs_host = Kokkos::create_mirror_view(xs); + auto ys_host = Kokkos::create_mirror_view(ys); + auto A_host = Kokkos::create_mirror_view(A); + auto Y_host = Kokkos::create_mirror_view(Y); + + Kokkos::deep_copy(xs_host, xs); + Kokkos::deep_copy(ys_host, ys); + + for (int i = 0; i < n_1; ++i) { + for (int j = 0; j < n_1; ++j) { + for (int l = 0; l < N; ++l) { + auto xs_i = Kokkos::subview(xs_host, l, i, Kokkos::ALL); + auto xs_j = Kokkos::subview(xs_host, l, j, Kokkos::ALL); + typename MatrixViewType::value_type d = 0; + for (int k = 0; k < n_dim; ++k) d += Kokkos::pow(xs_i(k) - xs_j(k), 2); + d = Kokkos::sqrt(d); + A_host(l, i, j) = Kokkos::pow(d, 5); + } + } + for (int l = 0; l < N; ++l) { + A_host(l, i, n_1) = (typename MatrixViewType::value_type)1.0; + A_host(l, n_1, i) = (typename MatrixViewType::value_type)1.0; + for (int k = 0; k < n_dim; ++k) { + A_host(l, i, n_1 + k + 1) = xs_host(l, i, k); + A_host(l, n_1 + k + 1, i) = xs_host(l, i, k); + } + Y_host(l, i) = ys_host(l, i); + } + } + for (int i = n_1; i < n; ++i) { + for (int l = 0; l < N; ++l) { + Y_host(l, i) = (typename MatrixViewType::value_type)0.0; + } + } + + Kokkos::deep_copy(A, A_host); + Kokkos::deep_copy(Y, Y_host); + + Kokkos::fence(); +} \ No newline at end of file diff --git a/src/batched/dense/KokkosBatched_Gesv.hpp b/src/batched/dense/KokkosBatched_Gesv.hpp new file mode 100644 index 0000000000..c0affa5fdf --- /dev/null +++ b/src/batched/dense/KokkosBatched_Gesv.hpp @@ -0,0 +1,139 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +#ifndef __KOKKOSBATCHED_GESV_HPP__ +#define __KOKKOSBATCHED_GESV_HPP__ + +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" + +namespace KokkosBatched { + +/// \brief Serial Batched GESV: +/// +/// Solve A_l x_l = b_l for all l = 0, ..., N +/// using a batched LU decomposition, 2 batched triangular solves, and a batched +/// static pivoting. +/// +/// \tparam MatrixType: Input type for the matrix, needs to be a 3D view +/// \tparam VectorType: Input type for the right-hand side and the solution, +/// needs to be a 2D view +/// +/// \param A [in]: batched matrix, a rank 3 view +/// \param X [out]: solution, a rank 2 view +/// \param B [in]: right-hand side, a rank 2 view +/// \param tmp [in]: a rank 3 view used to store temporary variable; dimension +/// must be N x n x (n+4) where N is the batched size and n is the number of +/// rows. +/// +/// No nested parallel_for is used inside of the function. +/// + +struct SerialGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, + const VectorType X, + const VectorType Y, + const MatrixType tmp); +}; + +/// \brief Team Batched GESV: +/// +/// Solve A_l x_l = b_l for all l = 0, ..., N +/// using a batched LU decomposition, 2 batched triangular solves, and a batched +/// static pivoting. +/// +/// \tparam MatrixType: Input type for the matrix, needs to be a 3D view +/// \tparam VectorType: Input type for the right-hand side and the solution, +/// needs to be a 2D view +/// +/// \param member [in]: TeamPolicy member +/// \param A [in]: batched matrix, a rank 3 view +/// \param X [out]: solution, a rank 2 view +/// \param B [in]: right-hand side, a rank 2 view +/// +/// A nested parallel_for with TeamThreadRange is used. +/// + +template +struct TeamGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const MatrixType A, + const VectorType X, + const VectorType Y); +}; + +/// \brief Team Vector Batched GESV: +/// +/// Solve A_l x_l = b_l for all l = 0, ..., N +/// using a batched LU decomposition, 2 batched triangular solves, and a batched +/// static pivoting. +/// +/// \tparam MatrixType: Input type for the matrix, needs to be a 3D view +/// \tparam VectorType: Input type for the right-hand side and the solution, +/// needs to be a 2D view +/// +/// \param member [in]: TeamPolicy member +/// \param A [in]: batched matrix, a rank 3 view +/// \param X [out]: solution, a rank 2 view +/// \param B [in]: right-hand side, a rank 2 view +/// +/// Two nested parallel_for with both TeamVectorRange and ThreadVectorRange +/// (or one with TeamVectorRange) are used inside. +/// + +template +struct TeamVectorGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const MatrixType A, + const VectorType X, + const VectorType Y); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Gesv_Impl.hpp" + +#endif diff --git a/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp new file mode 100644 index 0000000000..20bf334304 --- /dev/null +++ b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -0,0 +1,548 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +#ifndef __KOKKOSBATCHED_GESV_IMPL_HPP__ +#define __KOKKOSBATCHED_GESV_IMPL_HPP__ + +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include +#include "KokkosBatched_Trsm_Decl.hpp" + +namespace KokkosBatched { + +struct SerialStaticPivoting { + template + KOKKOS_INLINE_FUNCTION static void invoke( + const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, + const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, + const VectorType2 tmp_v_2); +}; + +template +struct TeamStaticPivoting { + template + KOKKOS_INLINE_FUNCTION static void invoke( + const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); +}; + +template +struct TeamVectorStaticPivoting { + template + KOKKOS_INLINE_FUNCTION static void invoke( + const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); +}; + +template +KOKKOS_INLINE_FUNCTION void SerialStaticPivoting::invoke( + const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, + const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, + const VectorType2 tmp_v_2) { + using value_type = typename MatrixType1::non_const_value_type; + const int n = A.extent(0); + + for (int i = 0; i < n; ++i) { + D2(i) = 0.; + tmp_v_1(i) = 0; + tmp_v_2(i) = 1.; + for (int j = 0; j < n; ++j) { + if (D2(i) < Kokkos::abs(A(j, i))) D2(i) = Kokkos::abs(A(j, i)); + if (tmp_v_1(i) < Kokkos::abs(A(i, j))) tmp_v_1(i) = Kokkos::abs(A(i, j)); + } + D2(i) = 1. / D2(i); + } + + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + A(i, j) *= D2(j); + } + } + + for (int i = 0; i < n; ++i) { + value_type D1_i = 0.; + for (int j = 0; j < n; ++j) { + if (D1_i < Kokkos::abs(A(i, j))) D1_i = Kokkos::abs(A(i, j)); + } + D1_i = 1. / D1_i; + for (int j = 0; j < n; ++j) { + A(i, j) *= D1_i; + } + Y(i) *= D1_i; + } + + for (int i = 0; i < n; ++i) { + int row_index = 0; + int col_index = 0; + value_type tmp_0 = 0.; + value_type tmp_1 = 0.; + for (int j = 0; j < n; ++j) { + if (tmp_0 < tmp_v_1(j)) { + tmp_0 = tmp_v_1(j); + row_index = j; + } + } + for (int j = 0; j < n; ++j) { + if (tmp_1 < Kokkos::abs(A(row_index, j) * tmp_v_2(j))) { + tmp_1 = Kokkos::abs(A(row_index, j) * tmp_v_2(j)); + col_index = j; + } + } + tmp_v_1(row_index) = 0.; + tmp_v_2(col_index) = 0.; + + for (int j = 0; j < n; ++j) { + PDAD(col_index, j) = A(row_index, j); + } + PDY(col_index) = Y(row_index); + } +} + +template +template +KOKKOS_INLINE_FUNCTION void TeamStaticPivoting::invoke( + const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, const VectorType2 tmp_v_2) { + using value_type = typename MatrixType1::non_const_value_type; + using reducer_value_type = + typename Kokkos::MaxLoc::value_type; + // Made this non-const in order to WORKAROUND issue #349 (Credit to C. Trott) + int n = A.extent(0); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { + D2(i) = 0.; + tmp_v_1(i) = 0; + tmp_v_2(i) = 1.; + for (int j = 0; j < n; ++j) { + if (D2(i) < Kokkos::abs(A(j, i))) D2(i) = Kokkos::abs(A(j, i)); + if (tmp_v_1(i) < Kokkos::abs(A(i, j))) tmp_v_1(i) = Kokkos::abs(A(i, j)); + } + D2(i) = 1. / D2(i); + }); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { + for (int j = 0; j < n; ++j) { + A(i, j) *= D2(j); + } + }); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { + value_type D1_i = 0.; + for (int j = 0; j < n; ++j) { + if (D1_i < Kokkos::abs(A(i, j))) D1_i = Kokkos::abs(A(i, j)); + } + D1_i = 1. / D1_i; + for (int j = 0; j < n; ++j) { + A(i, j) *= D1_i; + } + Y(i) *= D1_i; + }); + + for (int i = 0; i < n; ++i) { + int row_index, col_index; + reducer_value_type value; + Kokkos::MaxLoc reducer_value(value); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (tmp_v_1(j) > update.val) { + update.val = tmp_v_1(j); + update.loc = j; + } + }, + reducer_value); + row_index = value.loc; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (Kokkos::abs(A(row_index, j) * tmp_v_2(j)) > update.val) { + update.val = Kokkos::abs(A(row_index, j) * tmp_v_2(j)); + update.loc = j; + } + }, + reducer_value); + col_index = value.loc; + tmp_v_1(row_index) = 0.; + tmp_v_2(col_index) = 0.; + + for (int j = 0; j < n; ++j) { + PDAD(col_index, j) = A(row_index, j); + } + PDY(col_index) = Y(row_index); + } +} + +template +template +KOKKOS_INLINE_FUNCTION void TeamVectorStaticPivoting::invoke( + const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, const VectorType2 tmp_v_2) { + using value_type = typename MatrixType1::non_const_value_type; + using reducer_value_type = + typename Kokkos::MaxLoc::value_type; + const int n = A.extent(0); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { + D2(i) = 0.; + tmp_v_1(i) = 0; + tmp_v_2(i) = 1.; + reducer_value_type value; + Kokkos::MaxLoc reducer_value(value); + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (Kokkos::abs(A(j, i)) > update.val) { + update.val = Kokkos::abs(A(j, i)); + update.loc = j; + } + }, + reducer_value); + D2(i) = 1. / value.val; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (Kokkos::abs(A(i, j)) > update.val) { + update.val = Kokkos::abs(A(i, j)); + update.loc = j; + } + }, + reducer_value); + tmp_v_1(i) = value.val; + }); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), + [&](const int &j) { A(i, j) *= D2(j); }); + }); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { + value_type D1_i = 0.; + reducer_value_type value; + Kokkos::MaxLoc reducer_value(value); + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (Kokkos::abs(A(i, j)) > update.val) { + update.val = Kokkos::abs(A(i, j)); + update.loc = j; + } + }, + reducer_value); + D1_i = 1. / value.val; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), + [&](const int &j) { A(i, j) *= D1_i; }); + Y(i) *= D1_i; + }); + + for (int i = 0; i < n; ++i) { + int row_index, col_index; + reducer_value_type value; + Kokkos::MaxLoc reducer_value(value); + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (tmp_v_1(j) > update.val) { + update.val = tmp_v_1(j); + update.loc = j; + } + }, + reducer_value); + row_index = value.loc; + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (Kokkos::abs(A(row_index, j) * tmp_v_2(j)) > update.val) { + update.val = Kokkos::abs(A(row_index, j) * tmp_v_2(j)); + update.loc = j; + } + }, + reducer_value); + col_index = value.loc; + tmp_v_1(row_index) = 0.; + tmp_v_2(col_index) = 0.; + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { + PDAD(col_index, j) = A(row_index, j); + }); + PDY(col_index) = Y(row_index); + } +} + +template +KOKKOS_INLINE_FUNCTION void SerialHadamard1D(const VectorType1 X, + const VectorType2 D, + const VectorType3 DX) { + const int n = X.extent(0); + + for (int i = 0; i < n; ++i) { + DX(i) = D(i) * X(i); + } +} + +template +KOKKOS_INLINE_FUNCTION void TeamHadamard1D(const MemberType &member, + const VectorType1 X, + const VectorType2 D, + const VectorType3 DX) { + const int n = X.extent(0); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), + [&](const int &i) { DX(i) = D(i) * X(i); }); +} + +template +KOKKOS_INLINE_FUNCTION void TeamVectorHadamard1D(const MemberType &member, + const VectorType1 X, + const VectorType2 D, + const VectorType3 DX) { + const int n = X.extent(0); + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), + [&](const int &i) { DX(i) = D(i) * X(i); }); +} + +/// +/// Serial Impl +/// =========== +template +KOKKOS_INLINE_FUNCTION int SerialGesv::invoke(const MatrixType A, + const VectorType X, + const VectorType Y, + const MatrixType tmp) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + + if (A.extent(0) != tmp.extent(0) || A.extent(1) + 4 != tmp.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and tmp do not match: A: " + "%d x %d, tmp (note: its second dimension should be the second " + "dimension of A + 4): %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), + (int)tmp.extent(1)); + return 1; + } + + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); + return 1; + } +#endif + + const int n = A.extent(0); + + auto PDAD = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n)); + auto PDY = Kokkos::subview(tmp, Kokkos::ALL, n); + auto D2 = Kokkos::subview(tmp, Kokkos::ALL, n + 1); + auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); + auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); + + SerialStaticPivoting::invoke(A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2); + + SerialLU::invoke(PDAD); + + SerialTrsm::invoke(1.0, PDAD, PDY); + + SerialTrsm::invoke(1.0, PDAD, PDY); + + SerialHadamard1D(PDY, D2, X); + return 0; +} + +/// +/// Team Impl +/// ========= + +template +template +KOKKOS_INLINE_FUNCTION int TeamGesv::invoke( + const MemberType &member, const MatrixType A, const VectorType X, + const VectorType Y) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); + return 1; + } +#endif + using ScratchPadMatrixViewType = + Kokkos::View; + + const int n = A.extent(0); + + ScratchPadMatrixViewType tmp(member.team_scratch(0), n, n + 4); + auto PDAD = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n)); + auto PDY = Kokkos::subview(tmp, Kokkos::ALL, n); + auto D2 = Kokkos::subview(tmp, Kokkos::ALL, n + 1); + auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); + auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); + + TeamStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, tmp_v_1, + tmp_v_2); + member.team_barrier(); + + TeamLU::invoke(member, PDAD); + member.team_barrier(); + + TeamTrsm::invoke(member, 1.0, PDAD, PDY); + member.team_barrier(); + + TeamTrsm::invoke(member, 1.0, PDAD, + PDY); + member.team_barrier(); + + TeamHadamard1D(member, PDY, D2, X); + member.team_barrier(); + return 0; +} + +/// +/// TeamVector Impl +/// ========= + +template +template +KOKKOS_INLINE_FUNCTION int TeamVectorGesv::invoke( + const MemberType &member, const MatrixType A, const VectorType X, + const VectorType Y) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); + return 1; + } +#endif + using ScratchPadMatrixViewType = + Kokkos::View; + + const int n = A.extent(0); + + ScratchPadMatrixViewType tmp(member.team_scratch(0), n, n + 4); + auto PDAD = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n)); + auto PDY = Kokkos::subview(tmp, Kokkos::ALL, n); + auto D2 = Kokkos::subview(tmp, Kokkos::ALL, n + 1); + auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); + auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); + + TeamVectorStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, + tmp_v_1, tmp_v_2); + member.team_barrier(); + + TeamLU::invoke(member, PDAD); + member.team_barrier(); + + TeamTrsm::invoke(member, 1.0, PDAD, PDY); + member.team_barrier(); + + TeamTrsm::invoke(member, 1.0, PDAD, + PDY); + member.team_barrier(); + + TeamVectorHadamard1D(member, PDY, D2, X); + member.team_barrier(); + return 0; +} + +} // namespace KokkosBatched + +#endif diff --git a/unit_test/batched/dense/Test_Batched_Dense.hpp b/unit_test/batched/dense/Test_Batched_Dense.hpp index 47a1cf1fd4..57de7ebfdd 100644 --- a/unit_test/batched/dense/Test_Batched_Dense.hpp +++ b/unit_test/batched/dense/Test_Batched_Dense.hpp @@ -16,6 +16,8 @@ #include "Test_Batched_SerialGemv.hpp" #include "Test_Batched_SerialGemv_Real.hpp" #include "Test_Batched_SerialGemv_Complex.hpp" +#include "Test_Batched_SerialGesv.hpp" +#include "Test_Batched_SerialGesv_Real.hpp" #include "Test_Batched_SerialInverseLU.hpp" #include "Test_Batched_SerialInverseLU_Real.hpp" #include "Test_Batched_SerialInverseLU_Complex.hpp" @@ -52,6 +54,8 @@ #include "Test_Batched_TeamGemv.hpp" #include "Test_Batched_TeamGemv_Real.hpp" #include "Test_Batched_TeamGemv_Complex.hpp" +#include "Test_Batched_TeamGesv.hpp" +#include "Test_Batched_TeamGesv_Real.hpp" #include "Test_Batched_TeamInverseLU.hpp" #include "Test_Batched_TeamInverseLU_Real.hpp" #include "Test_Batched_TeamInverseLU_Complex.hpp" @@ -80,6 +84,8 @@ #include "Test_Batched_TeamVectorGemm.hpp" #include "Test_Batched_TeamVectorGemm_Real.hpp" #include "Test_Batched_TeamVectorGemm_Complex.hpp" +#include "Test_Batched_TeamVectorGesv.hpp" +#include "Test_Batched_TeamVectorGesv_Real.hpp" #include "Test_Batched_TeamVectorQR.hpp" #include "Test_Batched_TeamVectorQR_Real.hpp" #include "Test_Batched_TeamVectorQR_WithColumnPivoting.hpp" diff --git a/unit_test/batched/dense/Test_Batched_DenseUtils.hpp b/unit_test/batched/dense/Test_Batched_DenseUtils.hpp new file mode 100644 index 0000000000..d355159a9a --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_DenseUtils.hpp @@ -0,0 +1,44 @@ +#ifndef TEST_BATCHED_DENSE_HELPER_HPP +#define TEST_BATCHED_DENSE_HELPER_HPP + +namespace KokkosBatched { +template +void create_tridiagonal_batched_matrices(const MatrixViewType &A, + const VectorViewType &B) { + Kokkos::Random_XorShift64_Pool< + typename VectorViewType::device_type::execution_space> + random(13718); + Kokkos::fill_random( + B, random, + Kokkos::reduction_identity::prod()); + + auto A_host = Kokkos::create_mirror_view(A); + + const int N = A.extent(0); + const int BlkSize = A.extent(1); + + for (int l = 0; l < N; ++l) { + for (int i = 0; i < BlkSize; ++i) { + for (int j = i; j < BlkSize; ++j) { + if (i == j) + A_host(l, i, j) = typename VectorViewType::value_type(2.0); + else if (i == j - 1) { + A_host(l, i, j) = typename VectorViewType::value_type(-1.0); + A_host(l, j, i) = typename VectorViewType::value_type(-1.0); + } else { + A_host(l, i, j) = typename VectorViewType::value_type(0.0); + A_host(l, j, i) = typename VectorViewType::value_type(0.0); + } + } + } + } + + Kokkos::fence(); + + Kokkos::deep_copy(A, A_host); + + Kokkos::fence(); +} +} // namespace KokkosBatched + +#endif // TEST_BATCHED_DENSE_HELPER_HPP diff --git a/unit_test/batched/dense/Test_Batched_SerialGesv.hpp b/unit_test/batched/dense/Test_Batched_SerialGesv.hpp new file mode 100644 index 0000000000..15fe7dfacc --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_SerialGesv.hpp @@ -0,0 +1,139 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBatched_Gesv.hpp" +#include "KokkosBatched_Dot.hpp" +#include "KokkosBatched_Gemv_Decl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Gesv { + +template +struct Functor_TestBatchedSerialGesv { + const MatrixType _A; + const MatrixType _tmp; + const VectorType _X; + const VectorType _B; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedSerialGesv(const MatrixType &A, const MatrixType &tmp, + const VectorType &X, const VectorType &B) + : _A(A), _tmp(tmp), _X(X), _B(B) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto A = Kokkos::subview(_A, k, Kokkos::ALL, Kokkos::ALL); + auto x = Kokkos::subview(_X, k, Kokkos::ALL); + auto b = Kokkos::subview(_B, k, Kokkos::ALL); + auto tmp = Kokkos::subview(_tmp, k, Kokkos::ALL, Kokkos::ALL); + + KokkosBatched::SerialGesv::invoke(A, x, b, tmp); + } + + inline void run() { + typedef typename VectorType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialGesv"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _X.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } +}; + +template +void impl_test_batched_gesv(const int N, const int BlkSize) { + typedef typename MatrixType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + using MagnitudeType = + typename Kokkos::Details::ArithTraits::mag_type; + using NormViewType = + Kokkos::View; + + NormViewType sqr_norm_j("sqr_norm_j", N); + auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); + + MatrixType A("A", N, BlkSize, BlkSize), A2("A", N, BlkSize, BlkSize), + tmp("tmp", N, BlkSize, BlkSize + 4); + VectorType B("b", N, BlkSize), B2("b", N, BlkSize), X("x", N, BlkSize); + + create_tridiagonal_batched_matrices(A, B); + Kokkos::deep_copy(A2, A); + Kokkos::deep_copy(B2, B); + + auto A_host = Kokkos::create_mirror_view(A2); + auto B_host = Kokkos::create_mirror_view(B2); + auto X_host = Kokkos::create_mirror_view(X); + + Kokkos::deep_copy(A_host, A2); + Kokkos::deep_copy(B_host, B2); + + Kokkos::fence(); + + Functor_TestBatchedSerialGesv(A, tmp, X, + B) + .run(); + + Kokkos::fence(); + + Kokkos::deep_copy(X_host, X); + + for (int l = 0; l < N; ++l) + KokkosBatched::SerialGemv:: + invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), + Kokkos::subview(X_host, l, Kokkos::ALL), 1, + Kokkos::subview(B_host, l, Kokkos::ALL)); + + KokkosBatched::SerialDot::invoke(B_host, B_host, + sqr_norm_j_host); + + const MagnitudeType eps = 1.0e3 * ats::epsilon(); + + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(sqr_norm_j_host(l), 0, eps); +} +} // namespace Gesv +} // namespace Test + +template +int test_batched_gesv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View + MatrixType; + typedef Kokkos::View + VectorType; + + for (int i = 3; i < 10; ++i) { + Test::Gesv::impl_test_batched_gesv( + 1024, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View + MatrixType; + typedef Kokkos::View + VectorType; + + for (int i = 3; i < 10; ++i) { + Test::Gesv::impl_test_batched_gesv( + 1024, i); + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp b/unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp new file mode 100644 index 0000000000..f8d391a428 --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp @@ -0,0 +1,11 @@ +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, batched_scalar_serial_gesv_float) { + test_batched_gesv(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, batched_scalar_serial_gesv_double) { + test_batched_gesv(); +} +#endif \ No newline at end of file diff --git a/unit_test/batched/dense/Test_Batched_TeamGesv.hpp b/unit_test/batched/dense/Test_Batched_TeamGesv.hpp new file mode 100644 index 0000000000..bdef5eb68d --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_TeamGesv.hpp @@ -0,0 +1,149 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBatched_Gesv.hpp" +#include "KokkosBatched_Dot.hpp" +#include "KokkosBatched_Gemv_Decl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace TeamGesv { + +template +struct Functor_TestBatchedTeamGesv { + const MatrixType _A; + const VectorType _X; + const VectorType _B; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamGesv(const MatrixType &A, const VectorType &X, + const VectorType &B) + : _A(A), _X(X), _B(B) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { + const int matrix_id = static_cast(member.league_rank()); + auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL); + auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); + + member.team_barrier(); + KokkosBatched::TeamGesv::invoke(member, A, x, b); + member.team_barrier(); + } + + inline void run() { + typedef typename VectorType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamGesv"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), + Kokkos::AUTO()); + + using MatrixViewType = + Kokkos::View; + + const int n = _A.extent(1); + size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4); + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0)); + + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } +}; + +template +void impl_test_batched_gesv(const int N, const int BlkSize) { + typedef typename MatrixType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + using MagnitudeType = + typename Kokkos::Details::ArithTraits::mag_type; + using NormViewType = + Kokkos::View; + + NormViewType sqr_norm_j("sqr_norm_j", N); + auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); + + MatrixType A("A", N, BlkSize, BlkSize), A2("A", N, BlkSize, BlkSize); + VectorType B("b", N, BlkSize), B2("b", N, BlkSize), X("x", N, BlkSize); + + create_tridiagonal_batched_matrices(A, B); + Kokkos::deep_copy(A2, A); + Kokkos::deep_copy(B2, B); + + auto A_host = Kokkos::create_mirror_view(A2); + auto B_host = Kokkos::create_mirror_view(B2); + auto X_host = Kokkos::create_mirror_view(X); + + Kokkos::deep_copy(A_host, A2); + Kokkos::deep_copy(B_host, B2); + + Kokkos::fence(); + + Functor_TestBatchedTeamGesv(A, X, B) + .run(); + + Kokkos::fence(); + + Kokkos::deep_copy(X_host, X); + + for (int l = 0; l < N; ++l) + KokkosBatched::SerialGemv:: + invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), + Kokkos::subview(X_host, l, Kokkos::ALL), 1, + Kokkos::subview(B_host, l, Kokkos::ALL)); + + KokkosBatched::SerialDot::invoke(B_host, B_host, + sqr_norm_j_host); + + const MagnitudeType eps = 1.0e3 * ats::epsilon(); + + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(sqr_norm_j_host(l), 0, eps); +} +} // namespace TeamGesv +} // namespace Test + +template +int test_batched_team_gesv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View + MatrixType; + typedef Kokkos::View + VectorType; + + for (int i = 3; i < 10; ++i) { + Test::TeamGesv::impl_test_batched_gesv(1024, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View + MatrixType; + typedef Kokkos::View + VectorType; + + for (int i = 3; i < 10; ++i) { + Test::TeamGesv::impl_test_batched_gesv(1024, i); + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp new file mode 100644 index 0000000000..6b01a23d65 --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp @@ -0,0 +1,11 @@ +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, batched_scalar_team_gesv_float) { + test_batched_team_gesv(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, batched_scalar_team_gesv_double) { + test_batched_team_gesv(); +} +#endif \ No newline at end of file diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp new file mode 100644 index 0000000000..beac7b2e45 --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp @@ -0,0 +1,149 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBatched_Gesv.hpp" +#include "KokkosBatched_Dot.hpp" +#include "KokkosBatched_Gemv_Decl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace TeamVectorGesv { + +template +struct Functor_TestBatchedTeamVectorGesv { + const MatrixType _A; + const VectorType _X; + const VectorType _B; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamVectorGesv(const MatrixType &A, const VectorType &X, + const VectorType &B) + : _A(A), _X(X), _B(B) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { + const int matrix_id = static_cast(member.league_rank()); + auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL); + auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); + + member.team_barrier(); + KokkosBatched::TeamVectorGesv::invoke(member, A, x, b); + member.team_barrier(); + } + + inline void run() { + typedef typename VectorType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamVectorGesv"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), + Kokkos::AUTO()); + + using MatrixViewType = + Kokkos::View; + + const int n = _A.extent(1); + size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4); + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0)); + + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } +}; + +template +void impl_test_batched_gesv(const int N, const int BlkSize) { + typedef typename MatrixType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + using MagnitudeType = + typename Kokkos::Details::ArithTraits::mag_type; + using NormViewType = + Kokkos::View; + + NormViewType sqr_norm_j("sqr_norm_j", N); + auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); + + MatrixType A("A", N, BlkSize, BlkSize), A2("A", N, BlkSize, BlkSize); + VectorType B("b", N, BlkSize), B2("b", N, BlkSize), X("x", N, BlkSize); + + create_tridiagonal_batched_matrices(A, B); + Kokkos::deep_copy(A2, A); + Kokkos::deep_copy(B2, B); + + auto A_host = Kokkos::create_mirror_view(A2); + auto B_host = Kokkos::create_mirror_view(B2); + auto X_host = Kokkos::create_mirror_view(X); + + Kokkos::deep_copy(A_host, A2); + Kokkos::deep_copy(B_host, B2); + + Kokkos::fence(); + + Functor_TestBatchedTeamVectorGesv(A, X, B) + .run(); + + Kokkos::fence(); + + Kokkos::deep_copy(X_host, X); + + for (int l = 0; l < N; ++l) + KokkosBatched::SerialGemv:: + invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), + Kokkos::subview(X_host, l, Kokkos::ALL), 1, + Kokkos::subview(B_host, l, Kokkos::ALL)); + + KokkosBatched::SerialDot::invoke(B_host, B_host, + sqr_norm_j_host); + + const MagnitudeType eps = 1.0e3 * ats::epsilon(); + + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(sqr_norm_j_host(l), 0, eps); +} +} // namespace TeamVectorGesv +} // namespace Test + +template +int test_batched_teamvector_gesv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View + MatrixType; + typedef Kokkos::View + VectorType; + + for (int i = 3; i < 10; ++i) { + Test::TeamVectorGesv::impl_test_batched_gesv(1024, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View + MatrixType; + typedef Kokkos::View + VectorType; + + for (int i = 3; i < 10; ++i) { + Test::TeamVectorGesv::impl_test_batched_gesv(1024, i); + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp new file mode 100644 index 0000000000..a589f4aa2b --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp @@ -0,0 +1,11 @@ +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, batched_scalar_teamvector_gesv_float) { + test_batched_teamvector_gesv(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, batched_scalar_teamvector_gesv_double) { + test_batched_teamvector_gesv(); +} +#endif \ No newline at end of file From 03b88554a20b545621d6c6ee7a9df2a011083008 Mon Sep 17 00:00:00 2001 From: Jennifer Loe Date: Tue, 19 Apr 2022 15:42:33 -0600 Subject: [PATCH 086/261] Add verbosity parameter to GMRES example. Turn off for testing. --- example/gmres/ex_real_A.cpp | 156 +++++++++++++++++-------------- example/gmres/gmres.hpp | 46 ++++++--- example/gmres/test_cmplx_A.cpp | 1 + example/gmres/test_prec.cpp | 166 +++++++++++++++++++-------------- example/gmres/test_real_A.cpp | 1 + 5 files changed, 214 insertions(+), 156 deletions(-) diff --git a/example/gmres/ex_real_A.cpp b/example/gmres/ex_real_A.cpp index 2c119d2a9c..03eaeeff6e 100644 --- a/example/gmres/ex_real_A.cpp +++ b/example/gmres/ex_real_A.cpp @@ -42,31 +42,31 @@ //@HEADER */ -#include -#include"KokkosKernels_IOUtils.hpp" -#include -#include -#include -#include -#include +#include +#include "KokkosKernels_IOUtils.hpp" +#include +#include +#include +#include +#include -#include"gmres.hpp" +#include "gmres.hpp" -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) { typedef double ST; typedef int OT; - typedef Kokkos::DefaultExecutionSpace EXSP; + typedef Kokkos::DefaultExecutionSpace EXSP; - using ViewVectorType = Kokkos::View; + using ViewVectorType = Kokkos::View; - std::string filename("bcsstk09.mtx"); // example matrix - std::string ortho("CGS2"); //orthog type - int m = 50; //Max subspace size before restarting. - double convTol = 1e-10; //Relative residual convergence tolerance. - int cycLim = 50; //Maximum number of times to restart the solver. - bool rand_rhs = false; //Generate random right-hand side. + std::string filename("bcsstk09.mtx"); // example matrix + std::string ortho("CGS2"); // orthog type + int m = 50; // Max subspace size before restarting. + double convTol = 1e-10; // Relative residual convergence tolerance. + int cycLim = 50; // Maximum number of times to restart the solver. + bool rand_rhs = false; // Generate random right-hand side. - for (int i=1;i solverOpts; - solverOpts.tol = convTol; - solverOpts.m = m; + solverOpts.tol = convTol; + solverOpts.m = m; solverOpts.maxRestart = cycLim; - solverOpts.ortho = ortho; + solverOpts.ortho = ortho; + solverOpts.verbose = false; // No verbosity needed for most testing - //Initialize Kokkos AFTER parsing parameters: + // Initialize Kokkos AFTER parsing parameters: Kokkos::initialize(); { - - // Read in a matrix Market file and use it to test the Kokkos Operator. - KokkosSparse::CrsMatrix A = - KokkosKernels::Impl::read_kokkos_crst_matrix>(filename.c_str()); - - int n = A.numRows(); - ViewVectorType X("X",n); //Solution and initial guess - ViewVectorType Wj("Wj",n); //For checking residuals at end. - ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"),n);//right-hand side vec - - if(rand_rhs){ - // Make rhs random. - int rand_seed = 123; - Kokkos::Random_XorShift64_Pool<> pool(rand_seed); - Kokkos::fill_random(B, pool, -1,1); - } - else{ - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B,1.0); - } - - // Run GMRS solve: - GmresStats solveStats = gmres(A, B, X, solverOpts); - - // Double check residuals at end of solve: - ST nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. - ST endRes = KokkosBlas::nrm2(B)/nrmB; - std::cout << "=========================================" << std::endl; - std::cout << "Verify from main: Ending residual is " << endRes << std::endl; - std::cout << "Number of iterations is: " << solveStats.numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl; - std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; - + // Read in a matrix Market file and use it to test the Kokkos Operator. + KokkosSparse::CrsMatrix A = + KokkosKernels::Impl::read_kokkos_crst_matrix< + KokkosSparse::CrsMatrix>(filename.c_str()); + + int n = A.numRows(); + ViewVectorType X("X", n); // Solution and initial guess + ViewVectorType Wj("Wj", n); // For checking residuals at end. + ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + n); // right-hand side vec + + if (rand_rhs) { + // Make rhs random. + int rand_seed = 123; + Kokkos::Random_XorShift64_Pool<> pool(rand_seed); + Kokkos::fill_random(B, pool, -1, 1); + } else { + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + } + + // Run GMRS solve: + GmresStats solveStats = + gmres(A, B, X, solverOpts); + + // Double check residuals at end of solve: + ST nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + ST endRes = KokkosBlas::nrm2(B) / nrmB; + std::cout << "=========================================" << std::endl; + std::cout << "Verify from main: Ending residual is " << endRes << std::endl; + std::cout << "Number of iterations is: " << solveStats.numIters + << std::endl; + std::cout << "Diff of residual from main - residual from solver: " + << solveStats.endRelRes - endRes << std::endl; + std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; } Kokkos::finalize(); - } - diff --git a/example/gmres/gmres.hpp b/example/gmres/gmres.hpp index 48a6e4ae0d..22b23cde7a 100644 --- a/example/gmres/gmres.hpp +++ b/example/gmres/gmres.hpp @@ -117,10 +117,12 @@ struct GmresOpts { typename Kokkos::Details::ArithTraits::mag_type tol; int m; int maxRestart; + bool verbose; std::string ortho; std::string precSide; - GmresOpts() : tol(1e-8), m(50), maxRestart(50), ortho("CGS2") {} + GmresOpts() + : tol(1e-8), m(50), maxRestart(50), verbose(true), ortho("CGS2") {} }; template @@ -182,7 +184,9 @@ GmresStats gmres( MT nrmB, trueRes, relRes, shortRelRes; GmresStats myStats; - std::cout << "Convergence tolerance is: " << opts.tol << std::endl; + if (opts.verbose) { + std::cout << "Convergence tolerance is: " << opts.tol << std::endl; + } ViewVectorType Xiter( "Xiter", n); // Intermediate solution at iterations before restart. @@ -229,7 +233,9 @@ GmresStats gmres( relRes = 0; } shortRelRes = relRes; - std::cout << "Initial relative residual is: " << relRes << std::endl; + if (opts.verbose) { + std::cout << "Initial relative residual is: " << relRes << std::endl; + } if (relRes < opts.tol) { converged = true; } @@ -311,8 +317,10 @@ GmresStats gmres( GVec_h(j) = GVec_h(j) * CosVal_h(j); shortRelRes = fabs(GVec_h(j + 1)) / nrmB; - std::cout << "Shortcut relative residual for iteration " - << j + (cycle * m) << " is: " << shortRelRes << std::endl; + if (opts.verbose) { + std::cout << "Shortcut relative residual for iteration " + << j + (cycle * m) << " is: " << shortRelRes << std::endl; + } if (tmpNrm <= 1e-14 && shortRelRes >= opts.tol) { throw std::runtime_error( "GMRES has experienced lucky breakdown, but the residual has not converged.\n\ @@ -359,8 +367,10 @@ GmresStats gmres( KokkosBlas::axpy(-one, Wj, Res); // r = b-Ax. trueRes = KokkosBlas::nrm2(Res); relRes = trueRes / nrmB; - std::cout << "True relative residual for iteration " << j + (cycle * m) - << " is : " << relRes << std::endl; + if (opts.verbose) { + std::cout << "True relative residual for iteration " + << j + (cycle * m) << " is : " << relRes << std::endl; + } numIters = j + 1; if (relRes < opts.tol) { @@ -390,15 +400,21 @@ GmresStats gmres( std::cout << "Ending relative residual is: " << relRes << std::endl; myStats.endRelRes = static_cast(relRes); if (converged) { - std::cout << "Solver converged! " << std::endl; + if (opts.verbose) { + std::cout << "Solver converged! " << std::endl; + } myStats.convFlagVal = GmresStats::FLAG::Conv; } else if (shortRelRes < opts.tol) { - std::cout << "Shortcut residual converged, but solver experienced a loss " - "of accuracy." - << std::endl; + if (opts.verbose) { + std::cout << "Shortcut residual converged, but solver experienced a loss " + "of accuracy." + << std::endl; + } myStats.convFlagVal = GmresStats::FLAG::LOA; } else { - std::cout << "Solver did not converge. :( " << std::endl; + if (opts.verbose) { + std::cout << "Solver did not converge. :( " << std::endl; + } myStats.convFlagVal = GmresStats::FLAG::NoConv; } if (cycle > 0) { @@ -406,8 +422,10 @@ GmresStats gmres( } else { myStats.numIters = 0; } - std::cout << "The solver completed " << myStats.numIters << " iterations." - << std::endl; + if (opts.verbose) { + std::cout << "The solver completed " << myStats.numIters << " iterations." + << std::endl; + } Kokkos::Profiling::popRegion(); return myStats; diff --git a/example/gmres/test_cmplx_A.cpp b/example/gmres/test_cmplx_A.cpp index a19d6ad7e1..bc1ddce35b 100644 --- a/example/gmres/test_cmplx_A.cpp +++ b/example/gmres/test_cmplx_A.cpp @@ -65,6 +65,7 @@ int main(int /*argc*/, char** /*argv[]*/) { solverOpts.tol = 1e-05; // Relative residual convergence tolerance. solverOpts.maxRestart = 60; solverOpts.ortho = "CGS2"; // orthog type + solverOpts.verbose = false; // No verbosity needed for most testing bool pass1 = false; bool pass2 = false; diff --git a/example/gmres/test_prec.cpp b/example/gmres/test_prec.cpp index 852a735aa6..71b17007d2 100644 --- a/example/gmres/test_prec.cpp +++ b/example/gmres/test_prec.cpp @@ -42,30 +42,29 @@ //@HEADER */ -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) { + typedef double ST; + typedef int OT; + typedef Kokkos::DefaultExecutionSpace EXSP; - typedef double ST; - typedef int OT; - typedef Kokkos::DefaultExecutionSpace EXSP; + using ViewVectorType = Kokkos::View; - using ViewVectorType = Kokkos::View; - - std::string ortho("CGS2"); //orthog type - int n = 1000; //Matrix size - int m = 50; //Max subspace size before restarting. - double convTol = 1e-10; //Relative residual convergence tolerance. - int cycLim = 50; //Maximum number of times to restart the solver. - bool rand_rhs = false; //Generate random right-hand side. + std::string ortho("CGS2"); // orthog type + int n = 1000; // Matrix size + int m = 50; // Max subspace size before restarting. + double convTol = 1e-10; // Relative residual convergence tolerance. + int cycLim = 50; // Maximum number of times to restart the solver. + bool rand_rhs = false; // Generate random right-hand side. bool pass = false; - for (int i=1;i solverOpts; - solverOpts.tol = convTol; - solverOpts.m = m; + solverOpts.tol = convTol; + solverOpts.m = m; solverOpts.maxRestart = cycLim; - solverOpts.ortho = ortho; + solverOpts.ortho = ortho; + solverOpts.verbose = false; // No verbosity needed for most testing - //Initialize Kokkos AFTER parsing parameters: + // Initialize Kokkos AFTER parsing parameters: Kokkos::initialize(); { - // Generate a diagonal matrix with entries 1, 2, ...., 1000 and its inverse. - KokkosSparse::CrsMatrix A = - KokkosKernels::Impl::kk_generate_diag_matrix>(n); - KokkosSparse::Experimental::MatrixPrec * myPrec = - new KokkosSparse::Experimental::MatrixPrec( - KokkosKernels::Impl::kk_generate_diag_matrix>(n, true)); + // Generate a diagonal matrix with entries 1, 2, ...., 1000 and its inverse. + KokkosSparse::CrsMatrix A = + KokkosKernels::Impl::kk_generate_diag_matrix< + KokkosSparse::CrsMatrix>(n); + KokkosSparse::Experimental::MatrixPrec* + myPrec = + new KokkosSparse::Experimental::MatrixPrec( + KokkosKernels::Impl::kk_generate_diag_matrix< + KokkosSparse::CrsMatrix>(n, true)); - ViewVectorType X(Kokkos::view_alloc(Kokkos::WithoutInitializing, "X"),n); //Solution and initial guess - ViewVectorType Wj("Wj",n); //For checking residuals at end. - ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"),n);//right-hand side vec - int rand_seed = 123; - Kokkos::Random_XorShift64_Pool<> pool(rand_seed); - Kokkos::fill_random(X, pool, -1,1); //Use non-zero initial guess to test GMRES properties. - if(rand_rhs){ - Kokkos::fill_random(B, pool, -1,1); - } - else{ - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B,1.0); - } + ViewVectorType X(Kokkos::view_alloc(Kokkos::WithoutInitializing, "X"), + n); // Solution and initial guess + ViewVectorType Wj("Wj", n); // For checking residuals at end. + ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + n); // right-hand side vec + int rand_seed = 123; + Kokkos::Random_XorShift64_Pool<> pool(rand_seed); + Kokkos::fill_random( + X, pool, -1, + 1); // Use non-zero initial guess to test GMRES properties. + if (rand_rhs) { + Kokkos::fill_random(B, pool, -1, 1); + } else { + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + } - GmresStats solveStats = gmres(A, B, X, solverOpts, myPrec); - - // Double check residuals at end of solve: - ST nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. - ST endRes = KokkosBlas::nrm2(B)/nrmB; - std::cout << "=========================================" << std::endl; - std::cout << "Verify from main: Ending residual is " << endRes << std::endl; - std::cout << "Number of iterations is: " << solveStats.numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl; - std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; - if( endRes < convTol && solveStats.numIters == 1){ - pass = true; - } + GmresStats solveStats = + gmres(A, B, X, solverOpts, myPrec); + // Double check residuals at end of solve: + ST nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + ST endRes = KokkosBlas::nrm2(B) / nrmB; + std::cout << "=========================================" << std::endl; + std::cout << "Verify from main: Ending residual is " << endRes << std::endl; + std::cout << "Number of iterations is: " << solveStats.numIters + << std::endl; + std::cout << "Diff of residual from main - residual from solver: " + << solveStats.endRelRes - endRes << std::endl; + std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; + if (endRes < convTol && solveStats.numIters == 1) { + pass = true; + } } Kokkos::finalize(); - if( pass ){ + if (pass) { std::cout << "Test passed!" << std::endl; - } - else{ + } else { std::cout << "Test Failed!" << std::endl; } - return ( pass ? EXIT_SUCCESS : EXIT_FAILURE ); + return (pass ? EXIT_SUCCESS : EXIT_FAILURE); } - diff --git a/example/gmres/test_real_A.cpp b/example/gmres/test_real_A.cpp index 3f6edd06a3..26103da035 100644 --- a/example/gmres/test_real_A.cpp +++ b/example/gmres/test_real_A.cpp @@ -72,6 +72,7 @@ int main(int /*argc*/, char** /*argv[]*/) { solverOpts.m = 15; // Max subspace size before restarting. solverOpts.tol = 1e-10; // Relative residual convergence tolerance. solverOpts.maxRestart = 50; + solverOpts.verbose = false; // No verbosity needed for most testing bool pass1 = false; bool pass2 = false; From 106cc3499e5b30dc104d773c3ff7873d463ae1ad Mon Sep 17 00:00:00 2001 From: Jennifer Loe Date: Tue, 19 Apr 2022 15:47:58 -0600 Subject: [PATCH 087/261] Force clang formatter to rerun. --- example/gmres/ex_real_A.cpp | 2 +- example/gmres/test_prec.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/example/gmres/ex_real_A.cpp b/example/gmres/ex_real_A.cpp index 03eaeeff6e..1e3ba19585 100644 --- a/example/gmres/ex_real_A.cpp +++ b/example/gmres/ex_real_A.cpp @@ -64,7 +64,7 @@ int main(int argc, char* argv[]) { int m = 50; // Max subspace size before restarting. double convTol = 1e-10; // Relative residual convergence tolerance. int cycLim = 50; // Maximum number of times to restart the solver. - bool rand_rhs = false; // Generate random right-hand side. + bool rand_rhs = false; // Generate random right-hand side. for (int i = 1; i < argc; ++i) { const std::string& token = argv[i]; diff --git a/example/gmres/test_prec.cpp b/example/gmres/test_prec.cpp index 71b17007d2..a75c9dc59a 100644 --- a/example/gmres/test_prec.cpp +++ b/example/gmres/test_prec.cpp @@ -61,8 +61,8 @@ int main(int argc, char* argv[]) { int m = 50; // Max subspace size before restarting. double convTol = 1e-10; // Relative residual convergence tolerance. int cycLim = 50; // Maximum number of times to restart the solver. - bool rand_rhs = false; // Generate random right-hand side. - bool pass = false; + bool rand_rhs = false; // Generate random right-hand side. + bool pass = false; for (int i = 1; i < argc; ++i) { const std::string& token = argv[i]; From 03f5e5e440f65d960d9e290024559a66d1fd5bf1 Mon Sep 17 00:00:00 2001 From: kliegeois Date: Wed, 20 Apr 2022 14:13:24 -0600 Subject: [PATCH 088/261] Update the PR using Luc's comments --- example/CMakeLists.txt | 2 +- .../CMakeLists.txt | 2 +- .../examples_helper.hpp | 102 ++- .../static_pivoting.cpp} | 17 +- src/batched/dense/KokkosBatched_Gesv.hpp | 46 +- .../dense/impl/KokkosBatched_Gesv_Impl.hpp | 614 ++++++++++++------ src/common/KokkosKernels_IOUtils.hpp | 75 +++ .../batched/dense/Test_Batched_SerialGesv.hpp | 22 +- .../dense/Test_Batched_SerialGesv_Real.hpp | 18 +- .../batched/dense/Test_Batched_TeamGesv.hpp | 21 +- .../dense/Test_Batched_TeamGesv_Real.hpp | 20 +- .../dense/Test_Batched_TeamVectorGesv.hpp | 20 +- .../Test_Batched_TeamVectorGesv_Real.hpp | 20 +- 13 files changed, 668 insertions(+), 311 deletions(-) rename example/{static_pivoting => batched_solve}/CMakeLists.txt (85%) rename example/{static_pivoting => batched_solve}/examples_helper.hpp (69%) rename example/{static_pivoting/example.cpp => batched_solve/static_pivoting.cpp} (90%) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 9dd8d09749..6ef9a91e55 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -7,4 +7,4 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../test_common) #ADD_SUBDIRECTORY(graph) ADD_SUBDIRECTORY(wiki) ADD_SUBDIRECTORY(gmres) -ADD_SUBDIRECTORY(static_pivoting) +ADD_SUBDIRECTORY(batched_solve) diff --git a/example/static_pivoting/CMakeLists.txt b/example/batched_solve/CMakeLists.txt similarity index 85% rename from example/static_pivoting/CMakeLists.txt rename to example/batched_solve/CMakeLists.txt index 3bfc7e8d95..da55b170cd 100644 --- a/example/static_pivoting/CMakeLists.txt +++ b/example/batched_solve/CMakeLists.txt @@ -3,5 +3,5 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) KOKKOSKERNELS_ADD_EXECUTABLE( static_pivoting - SOURCES example.cpp + SOURCES static_pivoting.cpp ) \ No newline at end of file diff --git a/example/static_pivoting/examples_helper.hpp b/example/batched_solve/examples_helper.hpp similarity index 69% rename from example/static_pivoting/examples_helper.hpp rename to example/batched_solve/examples_helper.hpp index c9b5963c55..ffd774967b 100644 --- a/example/static_pivoting/examples_helper.hpp +++ b/example/batched_solve/examples_helper.hpp @@ -40,69 +40,65 @@ // ************************************************************************ //@HEADER -template -void write2DArrayToMM(std::string name, const XType x) { - std::ofstream myfile; - myfile.open(name); - - auto x_h = Kokkos::create_mirror_view(x); - - Kokkos::deep_copy(x_h, x); - - if (XType::Rank == 2) { - myfile << "%% MatrixMarket 2D Array\n%" << std::endl; - myfile << x_h.extent(0) << " " << x_h.extent(1) << std::endl; - - for (size_t i = 0; i < x_h.extent(0); ++i) { - for (size_t j = 0; j < x_h.extent(1); ++j) { - myfile << std::setprecision(15) << x_h(i, j) << " "; - } - myfile << std::endl; - } - - myfile.close(); - } -} - -template -void write3DArrayToMM(std::string name, const XType x) { - std::ofstream myfile; - myfile.open(name); - - auto x_h = Kokkos::create_mirror_view(x); - - Kokkos::deep_copy(x_h, x); - - if (XType::Rank == 3) { - myfile << "%% MatrixMarket 3D Array\n%" << std::endl; - myfile << x_h.extent(0) << " " << x_h.extent(1) << " " << x_h.extent(2) - << std::endl; - - for (size_t i = 0; i < x_h.extent(0); ++i) { - myfile << "Slice " << i << std::endl; - for (size_t j = 0; j < x_h.extent(1); ++j) { - for (size_t k = 0; k < x_h.extent(2); ++k) { - myfile << std::setprecision(15) << x_h(i, j, k) << " "; - } - myfile << std::endl; - } - } - - myfile.close(); - } -} +/// \brief create_saddle_point_matrices: +/// +/// This function creates the matrices and the rhs of a batched saddle point +/// systems where A and Y (the right hand side) are as follows: +/// +/// ___________ +/// | | T | +/// | B | C | +/// A = |-----+-----| +/// | C | 0 | +/// |_____|_____| +/// +/// _____ +/// | | +/// | D | +/// Y = |-----| +/// | 0 | +/// |_____| +/// +/// with A in R^{n \times n}, B in R^{(n-n_2) \times (n-n_2)} and +/// where B and C are computed as follows: +/// +/// 1. A sequence of n-n_2 points of R^{n_dim} is generated randomly: +/// x^(0), ..., x^(n-n_2-1) +/// 2. Given this sequence, the entries are computed as follows: +/// B_{(i,j)} = \| x^(i) - x^(j)\| +/// C_{(0,j)} = 1 +/// C_{(i,j)} = (x^(j))_{(i-1)} for i != 0 +/// +/// 3. D is generated randomly. +/// +/// This function uses a different sequence of x and a different D for every +/// systems within the batched system. +/// +/// As a consequence of its definitation, the diagonal of A is 0 for every +/// entries. +/// +/// \tparam MatrixViewType: type of the batched matrices +/// \tparam VectorViewType: type of the batched vectors +/// +/// \param A [in/out]: a rank 3 view that has to be prealocated that will store +/// the entries of the batched matrix. \param Y [in/out]: a rank 2 view that has +/// to be prealocated that will store the entries of the right hand side. \param +/// n_dim [in]: the dimension of the physical space where the points are +/// randomly generated (default = 3). +/// template void create_saddle_point_matrices(const MatrixViewType &A, - const VectorViewType &Y, const int n_2 = 4) { + const VectorViewType &Y, + const int n_dim = 3) { Kokkos::Random_XorShift64_Pool< typename MatrixViewType::device_type::execution_space> random(13718); const int N = A.extent(0); const int n = A.extent(1); + const int n_2 = n_dim + 1; const int n_1 = n - n_2; - const int n_dim = n_2 - 1; MatrixViewType xs("xs", N, n_1, n_dim); VectorViewType ys("ys", N, n_1); diff --git a/example/static_pivoting/example.cpp b/example/batched_solve/static_pivoting.cpp similarity index 90% rename from example/static_pivoting/example.cpp rename to example/batched_solve/static_pivoting.cpp index b703cb74ad..69ab25b62f 100644 --- a/example/static_pivoting/example.cpp +++ b/example/batched_solve/static_pivoting.cpp @@ -53,6 +53,7 @@ /// KokkosKernels headers #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Vector.hpp" +#include "KokkosKernels_IOUtils.hpp" #include #include @@ -86,7 +87,10 @@ struct Functor_TeamTestStaticPivoting { auto X = Kokkos::subview(_X, matrix_id, Kokkos::ALL); auto Y = Kokkos::subview(_Y, matrix_id, Kokkos::ALL); member.team_barrier(); - KokkosBatched::TeamGesv::invoke(member, A, X, Y); + KokkosBatched::TeamGesv::invoke(member, + A, X, + Y); member.team_barrier(); } @@ -126,7 +130,8 @@ struct Functor_SerialTestStaticPivoting { auto tmp = Kokkos::subview(_tmp, matrix_id, Kokkos::ALL, Kokkos::ALL); auto X = Kokkos::subview(_X, matrix_id, Kokkos::ALL); auto Y = Kokkos::subview(_Y, matrix_id, Kokkos::ALL); - KokkosBatched::SerialGesv::invoke(A, X, Y, tmp); + KokkosBatched::SerialGesv::invoke( + A, X, Y, tmp); } inline void run() { @@ -162,16 +167,16 @@ int main(int /*argc*/, char ** /*argv[]*/) { Kokkos::deep_copy(A2, A); Kokkos::deep_copy(Y2, Y); - write3DArrayToMM("A.mm", A); - write2DArrayToMM("Y.mm", Y); + KokkosKernels::Impl::kk_write_3Dview_to_file(A, "A.txt"); + KokkosKernels::Impl::kk_write_2Dview_to_file(Y, "Y.txt"); Functor_SerialTestStaticPivoting(A, tmp, X, Y) .run(); - write2DArrayToMM("X_serial.mm", X); + KokkosKernels::Impl::kk_write_2Dview_to_file(X, "X_serial.txt"); Functor_TeamTestStaticPivoting(A2, X, Y2) .run(); - write2DArrayToMM("X_team.mm", X); + KokkosKernels::Impl::kk_write_2Dview_to_file(X, "X_team.txt"); } Kokkos::finalize(); } diff --git a/src/batched/dense/KokkosBatched_Gesv.hpp b/src/batched/dense/KokkosBatched_Gesv.hpp index c0affa5fdf..08ad9644a0 100644 --- a/src/batched/dense/KokkosBatched_Gesv.hpp +++ b/src/batched/dense/KokkosBatched_Gesv.hpp @@ -49,6 +49,13 @@ namespace KokkosBatched { +struct Gesv { + struct StaticPivoting {}; + struct NoPivoting {}; + + using Default = StaticPivoting; +}; + /// \brief Serial Batched GESV: /// /// Solve A_l x_l = b_l for all l = 0, ..., N @@ -66,9 +73,22 @@ namespace KokkosBatched { /// must be N x n x (n+4) where N is the batched size and n is the number of /// rows. /// +/// +/// Two versions are available (those are chosen based on ArgAlgo): +/// +/// 1. NoPivoting: the solver does not use a pivoting strategy, +/// 2. StaticPivoting: the solver uses a static pivoting strategy that relies +/// on using +/// maximal absolute value of row and column to choose pivots and apply +/// them before calling the LU decomposition. Known limitation: the +/// currently implemented strategy would not work with some matrices such +/// as [[2, 1], [1, 0]], when this is the case, the Gesv (if used with +/// pivoting), will return 1 and print an error message. +/// /// No nested parallel_for is used inside of the function. /// +template struct SerialGesv { template KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, @@ -92,10 +112,21 @@ struct SerialGesv { /// \param X [out]: solution, a rank 2 view /// \param B [in]: right-hand side, a rank 2 view /// +/// Two versions are available (those are chosen based on ArgAlgo): +/// +/// 1. NoPivoting: the solver does not use a pivoting strategy, +/// 2. StaticPivoting: the solver uses a static pivoting strategy that relies +/// on using +/// maximal absolute value of row and column to choose pivots and apply +/// them before calling the LU decomposition. Known limitation: the +/// currently implemented strategy would not work with some matrices such +/// as [[2, 1], [1, 0]], when this is the case, the Gesv (if used with +/// pivoting), will return 1 and print an error message. +/// /// A nested parallel_for with TeamThreadRange is used. /// -template +template struct TeamGesv { template KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, @@ -119,11 +150,22 @@ struct TeamGesv { /// \param X [out]: solution, a rank 2 view /// \param B [in]: right-hand side, a rank 2 view /// +/// Two versions are available (those are chosen based on ArgAlgo): +/// +/// 1. NoPivoting: the solver does not use a pivoting strategy, +/// 2. StaticPivoting: the solver uses a static pivoting strategy that relies +/// on using +/// maximal absolute value of row and column to choose pivots and apply +/// them before calling the LU decomposition. Known limitation: the +/// currently implemented strategy would not work with some matrices such +/// as [[2, 1], [1, 0]], when this is the case, the Gesv (if used with +/// pivoting), will return 1 and print an error message. +/// /// Two nested parallel_for with both TeamVectorRange and ThreadVectorRange /// (or one with TeamVectorRange) are used inside. /// -template +template struct TeamVectorGesv { template KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, diff --git a/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index 20bf334304..3f6cce79f7 100644 --- a/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -47,13 +47,14 @@ #include "KokkosBatched_Util.hpp" #include #include "KokkosBatched_Trsm_Decl.hpp" +#include "KokkosBatched_Copy_Decl.hpp" namespace KokkosBatched { struct SerialStaticPivoting { template - KOKKOS_INLINE_FUNCTION static void invoke( + KOKKOS_INLINE_FUNCTION static int invoke( const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); @@ -63,7 +64,7 @@ template struct TeamStaticPivoting { template - KOKKOS_INLINE_FUNCTION static void invoke( + KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); @@ -73,7 +74,7 @@ template struct TeamVectorStaticPivoting { template - KOKKOS_INLINE_FUNCTION static void invoke( + KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); @@ -81,87 +82,109 @@ struct TeamVectorStaticPivoting { template -KOKKOS_INLINE_FUNCTION void SerialStaticPivoting::invoke( +KOKKOS_INLINE_FUNCTION int SerialStaticPivoting::invoke( const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, const VectorType2 tmp_v_2) { using value_type = typename MatrixType1::non_const_value_type; - const int n = A.extent(0); + const size_t n = A.extent(0); - for (int i = 0; i < n; ++i) { - D2(i) = 0.; + // First, the algorithm loops over the rows and columns and search + // for the maximal absolute value per row and column. + for (size_t i = 0; i < n; ++i) { + D2(i) = Kokkos::ArithTraits::zero(); tmp_v_1(i) = 0; tmp_v_2(i) = 1.; - for (int j = 0; j < n; ++j) { + for (size_t j = 0; j < n; ++j) { if (D2(i) < Kokkos::abs(A(j, i))) D2(i) = Kokkos::abs(A(j, i)); if (tmp_v_1(i) < Kokkos::abs(A(i, j))) tmp_v_1(i) = Kokkos::abs(A(i, j)); } D2(i) = 1. / D2(i); } - for (int i = 0; i < n; ++i) { - for (int j = 0; j < n; ++j) { + // Then, the inverse of the maximal value per column is used to scale + // A by the right. + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { A(i, j) *= D2(j); } } - for (int i = 0; i < n; ++i) { - value_type D1_i = 0.; - for (int j = 0; j < n; ++j) { + // Once again, the algorithm loops over the rows and store the maximal + // absolute value per row but after the right scalling and do a left scalling + // of A and Y. + value_type D1_i; + for (size_t i = 0; i < n; ++i) { + D1_i = Kokkos::ArithTraits::zero(); + for (size_t j = 0; j < n; ++j) { if (D1_i < Kokkos::abs(A(i, j))) D1_i = Kokkos::abs(A(i, j)); } D1_i = 1. / D1_i; - for (int j = 0; j < n; ++j) { + for (size_t j = 0; j < n; ++j) { A(i, j) *= D1_i; } Y(i) *= D1_i; } - for (int i = 0; i < n; ++i) { + // Finally, the algorithm starts to loop over the rows in an order such that + // their initial maximal absolute value decrease (it uses the tmp_v_1 to do + // so), then for a given row, it finds the available column with the largest + // absolute value. If this value is zero, the algorithm failed to compute a + // good pivot, otherwise it puts the current row to the found column index and + // it labels the row and column index as unavailable and continue the loop + // over the rows. + // + for (size_t i = 0; i < n; ++i) { int row_index = 0; int col_index = 0; - value_type tmp_0 = 0.; - value_type tmp_1 = 0.; - for (int j = 0; j < n; ++j) { + value_type tmp_0 = Kokkos::ArithTraits::zero(); + value_type tmp_1 = Kokkos::ArithTraits::zero(); + for (size_t j = 0; j < n; ++j) { if (tmp_0 < tmp_v_1(j)) { tmp_0 = tmp_v_1(j); row_index = j; } } - for (int j = 0; j < n; ++j) { + for (size_t j = 0; j < n; ++j) { if (tmp_1 < Kokkos::abs(A(row_index, j) * tmp_v_2(j))) { tmp_1 = Kokkos::abs(A(row_index, j) * tmp_v_2(j)); col_index = j; } } - tmp_v_1(row_index) = 0.; - tmp_v_2(col_index) = 0.; + if (tmp_1 == Kokkos::ArithTraits::zero()) return 1; + tmp_v_1(row_index) = Kokkos::ArithTraits::zero(); + tmp_v_2(col_index) = Kokkos::ArithTraits::zero(); - for (int j = 0; j < n; ++j) { + for (size_t j = 0; j < n; ++j) { PDAD(col_index, j) = A(row_index, j); } PDY(col_index) = Y(row_index); } + + return 0; } template template -KOKKOS_INLINE_FUNCTION void TeamStaticPivoting::invoke( +KOKKOS_INLINE_FUNCTION int TeamStaticPivoting::invoke( const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, const VectorType2 tmp_v_2) { using value_type = typename MatrixType1::non_const_value_type; using reducer_value_type = typename Kokkos::MaxLoc::value_type; + // This implementation follows the strategy of SerialStaticPivoting but uses + // an extra level of parallelism. + // Made this non-const in order to WORKAROUND issue #349 (Credit to C. Trott) - int n = A.extent(0); + size_t n = A.extent(0); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { - D2(i) = 0.; + D2(i) = Kokkos::ArithTraits::zero(); tmp_v_1(i) = 0; tmp_v_2(i) = 1.; - for (int j = 0; j < n; ++j) { + for (size_t j = 0; j < n; ++j) { if (D2(i) < Kokkos::abs(A(j, i))) D2(i) = Kokkos::abs(A(j, i)); if (tmp_v_1(i) < Kokkos::abs(A(i, j))) tmp_v_1(i) = Kokkos::abs(A(i, j)); } @@ -169,24 +192,24 @@ KOKKOS_INLINE_FUNCTION void TeamStaticPivoting::invoke( }); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { - for (int j = 0; j < n; ++j) { + for (size_t j = 0; j < n; ++j) { A(i, j) *= D2(j); } }); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { - value_type D1_i = 0.; - for (int j = 0; j < n; ++j) { + value_type D1_i = Kokkos::ArithTraits::zero(); + for (size_t j = 0; j < n; ++j) { if (D1_i < Kokkos::abs(A(i, j))) D1_i = Kokkos::abs(A(i, j)); } D1_i = 1. / D1_i; - for (int j = 0; j < n; ++j) { + for (size_t j = 0; j < n; ++j) { A(i, j) *= D1_i; } Y(i) *= D1_i; }); - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { int row_index, col_index; reducer_value_type value; Kokkos::MaxLoc reducer_value(value); @@ -209,31 +232,36 @@ KOKKOS_INLINE_FUNCTION void TeamStaticPivoting::invoke( } }, reducer_value); - col_index = value.loc; - tmp_v_1(row_index) = 0.; - tmp_v_2(col_index) = 0.; + col_index = value.loc; + if (value.val == Kokkos::ArithTraits::zero()) return 1; + tmp_v_1(row_index) = Kokkos::ArithTraits::zero(); + tmp_v_2(col_index) = Kokkos::ArithTraits::zero(); - for (int j = 0; j < n; ++j) { + for (size_t j = 0; j < n; ++j) { PDAD(col_index, j) = A(row_index, j); } PDY(col_index) = Y(row_index); } + return 0; } template template -KOKKOS_INLINE_FUNCTION void TeamVectorStaticPivoting::invoke( +KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, const VectorType2 tmp_v_2) { using value_type = typename MatrixType1::non_const_value_type; using reducer_value_type = typename Kokkos::MaxLoc::value_type; - const int n = A.extent(0); + // This implementation follows the strategy of SerialStaticPivoting but uses + // two extra levels of parallelism. + + const size_t n = A.extent(0); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { - D2(i) = 0.; + D2(i) = Kokkos::ArithTraits::zero(); tmp_v_1(i) = 0; tmp_v_2(i) = 1.; reducer_value_type value; @@ -266,7 +294,7 @@ KOKKOS_INLINE_FUNCTION void TeamVectorStaticPivoting::invoke( }); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { - value_type D1_i = 0.; + value_type D1_i = Kokkos::ArithTraits::zero(); reducer_value_type value; Kokkos::MaxLoc reducer_value(value); Kokkos::parallel_reduce( @@ -284,7 +312,7 @@ KOKKOS_INLINE_FUNCTION void TeamVectorStaticPivoting::invoke( Y(i) *= D1_i; }); - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { int row_index, col_index; reducer_value_type value; Kokkos::MaxLoc reducer_value(value); @@ -307,24 +335,26 @@ KOKKOS_INLINE_FUNCTION void TeamVectorStaticPivoting::invoke( } }, reducer_value); - col_index = value.loc; - tmp_v_1(row_index) = 0.; - tmp_v_2(col_index) = 0.; + col_index = value.loc; + if (value.val == Kokkos::ArithTraits::zero()) return 1; + tmp_v_1(row_index) = Kokkos::ArithTraits::zero(); + tmp_v_2(col_index) = Kokkos::ArithTraits::zero(); Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { PDAD(col_index, j) = A(row_index, j); }); PDY(col_index) = Y(row_index); } + return 0; } template KOKKOS_INLINE_FUNCTION void SerialHadamard1D(const VectorType1 X, const VectorType2 D, const VectorType3 DX) { - const int n = X.extent(0); + const size_t n = X.extent(0); - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < n; ++i) { DX(i) = D(i) * X(i); } } @@ -335,10 +365,10 @@ KOKKOS_INLINE_FUNCTION void TeamHadamard1D(const MemberType &member, const VectorType1 X, const VectorType2 D, const VectorType3 DX) { - const int n = X.extent(0); + const size_t n = X.extent(0); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), - [&](const int &i) { DX(i) = D(i) * X(i); }); + [&](const size_t &i) { DX(i) = D(i) * X(i); }); } template -KOKKOS_INLINE_FUNCTION int SerialGesv::invoke(const MatrixType A, - const VectorType X, - const VectorType Y, - const MatrixType tmp) { +template <> +struct SerialGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, + const VectorType X, + const VectorType Y, + const MatrixType tmp) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); - - // Check compatibility of dimensions at run time. - - if (A.extent(0) != tmp.extent(0) || A.extent(1) + 4 != tmp.extent(1)) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: dimensions of A and tmp do not match: A: " - "%d x %d, tmp (note: its second dimension should be the second " - "dimension of A + 4): %d x %d\n", - (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), - (int)tmp.extent(1)); - return 1; - } + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + + if (A.extent(0) != tmp.extent(0) || A.extent(1) + 4 != tmp.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and tmp do not match: A: " + "%d x %d, tmp (note: its second dimension should be the second " + "dimension of A + 4): %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), + (int)tmp.extent(1)); + return 1; + } - if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " - "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); - return 1; - } + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); + return 1; + } #endif - const int n = A.extent(0); + const int n = A.extent(0); - auto PDAD = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n)); - auto PDY = Kokkos::subview(tmp, Kokkos::ALL, n); - auto D2 = Kokkos::subview(tmp, Kokkos::ALL, n + 1); - auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); - auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); + auto PDAD = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n)); + auto PDY = Kokkos::subview(tmp, Kokkos::ALL, n); + auto D2 = Kokkos::subview(tmp, Kokkos::ALL, n + 1); + auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); + auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); - SerialStaticPivoting::invoke(A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2); + if (SerialStaticPivoting::invoke(A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == + 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: the currently implemented static pivoting " + "failed.\n"); + return 1; + } - SerialLU::invoke(PDAD); + SerialLU::invoke(PDAD); - SerialTrsm::invoke(1.0, PDAD, PDY); + SerialTrsm::invoke(1.0, PDAD, PDY); - SerialTrsm::invoke(1.0, PDAD, PDY); + SerialTrsm::invoke(1.0, PDAD, PDY); - SerialHadamard1D(PDY, D2, X); - return 0; -} + SerialHadamard1D(PDY, D2, X); + return 0; + } +}; + +template <> +struct SerialGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, + const VectorType X, + const VectorType Y, + const MatrixType /*tmp*/) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); + return 1; + } +#endif + + SerialLU::invoke(A); + + SerialCopy::invoke(Y, X); + SerialTrsm::invoke(1.0, A, X); + + SerialTrsm::invoke(1.0, A, X); + + return 0; + } +}; /// /// Team Impl /// ========= template -template -KOKKOS_INLINE_FUNCTION int TeamGesv::invoke( - const MemberType &member, const MatrixType A, const VectorType X, - const VectorType Y) { +struct TeamGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const MatrixType A, + const VectorType X, + const VectorType Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); - - // Check compatibility of dimensions at run time. - if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " - "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); - return 1; + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); + return 1; + } +#endif + using ScratchPadMatrixViewType = Kokkos::View< + typename MatrixType::non_const_value_type **, + typename MatrixType::array_layout, + typename MatrixType::execution_space::scratch_memory_space>; + + const int n = A.extent(0); + + ScratchPadMatrixViewType tmp(member.team_scratch(0), n, n + 4); + auto PDAD = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n)); + auto PDY = Kokkos::subview(tmp, Kokkos::ALL, n); + auto D2 = Kokkos::subview(tmp, Kokkos::ALL, n + 1); + auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); + auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); + + if (TeamStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, + tmp_v_1, tmp_v_2) == 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: the currently implemented static pivoting " + "failed.\n"); + return 1; + } + member.team_barrier(); + + TeamLU::invoke(member, PDAD); + member.team_barrier(); + + TeamTrsm::invoke(member, 1.0, PDAD, + PDY); + member.team_barrier(); + + TeamTrsm::invoke(member, 1.0, PDAD, + PDY); + member.team_barrier(); + + TeamHadamard1D(member, PDY, D2, X); + member.team_barrier(); + return 0; } +}; + +template +struct TeamGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const MatrixType A, + const VectorType X, + const VectorType Y) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); + return 1; + } #endif - using ScratchPadMatrixViewType = - Kokkos::View; - - const int n = A.extent(0); - - ScratchPadMatrixViewType tmp(member.team_scratch(0), n, n + 4); - auto PDAD = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n)); - auto PDY = Kokkos::subview(tmp, Kokkos::ALL, n); - auto D2 = Kokkos::subview(tmp, Kokkos::ALL, n + 1); - auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); - auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); - - TeamStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, tmp_v_1, - tmp_v_2); - member.team_barrier(); - - TeamLU::invoke(member, PDAD); - member.team_barrier(); - - TeamTrsm::invoke(member, 1.0, PDAD, PDY); - member.team_barrier(); - - TeamTrsm::invoke(member, 1.0, PDAD, - PDY); - member.team_barrier(); - - TeamHadamard1D(member, PDY, D2, X); - member.team_barrier(); - return 0; -} + + TeamLU::invoke(member, A); + member.team_barrier(); + + TeamCopy::invoke(member, Y, X); + member.team_barrier(); + + TeamTrsm::invoke(member, 1.0, A, X); + member.team_barrier(); + + TeamTrsm::invoke(member, 1.0, A, X); + member.team_barrier(); + + return 0; + } +}; /// /// TeamVector Impl /// ========= template -template -KOKKOS_INLINE_FUNCTION int TeamVectorGesv::invoke( - const MemberType &member, const MatrixType A, const VectorType X, - const VectorType Y) { +struct TeamVectorGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const MatrixType A, + const VectorType X, + const VectorType Y) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); - static_assert(Kokkos::is_view::value, - "KokkosBatched::gesv: VectorType is not a Kokkos::View."); - static_assert(MatrixType::Rank == 2, - "KokkosBatched::gesv: MatrixType must have rank 2."); - static_assert(VectorType::Rank == 1, - "KokkosBatched::gesv: VectorType must have rank 1."); - - // Check compatibility of dimensions at run time. - if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || - A.extent(0) != Y.extent(0)) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " - "%d x %d, X: %d, Y: %d\n", - (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), (int)Y.extent(0)); - return 1; + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); + return 1; + } +#endif + using ScratchPadMatrixViewType = Kokkos::View< + typename MatrixType::non_const_value_type **, + typename MatrixType::array_layout, + typename MatrixType::execution_space::scratch_memory_space>; + + const int n = A.extent(0); + + ScratchPadMatrixViewType tmp(member.team_scratch(0), n, n + 4); + auto PDAD = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n)); + auto PDY = Kokkos::subview(tmp, Kokkos::ALL, n); + auto D2 = Kokkos::subview(tmp, Kokkos::ALL, n + 1); + auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); + auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); + + if (TeamVectorStaticPivoting::invoke( + member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: the currently implemented static pivoting " + "failed.\n"); + return 1; + } + + member.team_barrier(); + + TeamLU::invoke(member, PDAD); + member.team_barrier(); + + TeamVectorTrsm::invoke(member, 1.0, + PDAD, PDY); + member.team_barrier(); + + TeamVectorTrsm::invoke(member, 1.0, + PDAD, PDY); + member.team_barrier(); + + TeamVectorHadamard1D(member, PDY, D2, X); + member.team_barrier(); + return 0; } +}; + +template +struct TeamVectorGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const MatrixType A, + const VectorType X, + const VectorType Y) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); + return 1; + } #endif - using ScratchPadMatrixViewType = - Kokkos::View; - - const int n = A.extent(0); - - ScratchPadMatrixViewType tmp(member.team_scratch(0), n, n + 4); - auto PDAD = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n)); - auto PDY = Kokkos::subview(tmp, Kokkos::ALL, n); - auto D2 = Kokkos::subview(tmp, Kokkos::ALL, n + 1); - auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); - auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); - - TeamVectorStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, - tmp_v_1, tmp_v_2); - member.team_barrier(); - - TeamLU::invoke(member, PDAD); - member.team_barrier(); - - TeamTrsm::invoke(member, 1.0, PDAD, PDY); - member.team_barrier(); - - TeamTrsm::invoke(member, 1.0, PDAD, - PDY); - member.team_barrier(); - - TeamVectorHadamard1D(member, PDY, D2, X); - member.team_barrier(); - return 0; -} + + TeamLU::invoke(member, A); + member.team_barrier(); + + TeamVectorCopy::invoke(member, Y, X); + member.team_barrier(); + + TeamVectorTrsm::invoke(member, 1.0, A, + X); + member.team_barrier(); + + TeamVectorTrsm::invoke(member, 1.0, + A, X); + member.team_barrier(); + + return 0; + } +}; } // namespace KokkosBatched diff --git a/src/common/KokkosKernels_IOUtils.hpp b/src/common/KokkosKernels_IOUtils.hpp index bf1f3b4bfc..b0575197b0 100644 --- a/src/common/KokkosKernels_IOUtils.hpp +++ b/src/common/KokkosKernels_IOUtils.hpp @@ -550,6 +550,81 @@ inline void kk_read_1Dview_from_file(idx_array_type &view, Kokkos::fence(); } +template +inline void kk_write_2Dview_to_file(idx_array_type view, const char *filename) { + typedef typename idx_array_type::HostMirror host_type; + // typedef typename idx_array_type::size_type idx; + host_type host_view = Kokkos::create_mirror_view(view); + Kokkos::deep_copy(host_view, view); + Kokkos::fence(); + std::ofstream myFile(filename, std::ios::out); + for (size_t i = 0; i < view.extent(0); ++i) { + for (size_t j = 0; j < view.extent(1); ++j) { + myFile << host_view(i, j) << " "; + } + myFile << std::endl; + } + myFile.close(); +} + +template +inline void kk_read_2Dview_from_file(idx_array_type &view, + const char *filename) { + typedef typename idx_array_type::HostMirror host_type; + // typedef typename idx_array_type::size_type idx; + host_type host_view = Kokkos::create_mirror_view(view); + std::ifstream myFile(filename, std::ios::in); + + for (size_t i = 0; i < view.extent(0); ++i) { + for (size_t j = 0; j < view.extent(1); ++j) { + myFile >> host_view(i, j); + } + } + myFile.close(); + Kokkos::deep_copy(view, host_view); + Kokkos::fence(); +} + +template +inline void kk_write_3Dview_to_file(idx_array_type view, const char *filename) { + typedef typename idx_array_type::HostMirror host_type; + // typedef typename idx_array_type::size_type idx; + host_type host_view = Kokkos::create_mirror_view(view); + Kokkos::deep_copy(host_view, view); + Kokkos::fence(); + std::ofstream myFile(filename, std::ios::out); + for (size_t i = 0; i < view.extent(0); ++i) { + for (size_t j = 0; j < view.extent(1); ++j) { + for (size_t k = 0; k < view.extent(2); ++k) { + myFile << host_view(i, j, k) << " "; + } + myFile << std::endl; + } + myFile << std::endl; + } + myFile.close(); +} + +template +inline void kk_read_3Dview_from_file(idx_array_type &view, + const char *filename) { + typedef typename idx_array_type::HostMirror host_type; + // typedef typename idx_array_type::size_type idx; + host_type host_view = Kokkos::create_mirror_view(view); + std::ifstream myFile(filename, std::ios::in); + + for (size_t i = 0; i < view.extent(0); ++i) { + for (size_t j = 0; j < view.extent(1); ++j) { + for (size_t k = 0; k < view.extent(2); ++k) { + myFile >> host_view(i, j, k); + } + } + } + myFile.close(); + Kokkos::deep_copy(view, host_view); + Kokkos::fence(); +} + template void convert_crs_to_lower_triangle_edge_list(idx nv, idx *xadj, idx *adj, idx *lower_triangle_srcs, diff --git a/unit_test/batched/dense/Test_Batched_SerialGesv.hpp b/unit_test/batched/dense/Test_Batched_SerialGesv.hpp index 15fe7dfacc..233d6bedf3 100644 --- a/unit_test/batched/dense/Test_Batched_SerialGesv.hpp +++ b/unit_test/batched/dense/Test_Batched_SerialGesv.hpp @@ -17,7 +17,8 @@ using namespace KokkosBatched; namespace Test { namespace Gesv { -template +template struct Functor_TestBatchedSerialGesv { const MatrixType _A; const MatrixType _tmp; @@ -36,7 +37,7 @@ struct Functor_TestBatchedSerialGesv { auto b = Kokkos::subview(_B, k, Kokkos::ALL); auto tmp = Kokkos::subview(_tmp, k, Kokkos::ALL, Kokkos::ALL); - KokkosBatched::SerialGesv::invoke(A, x, b, tmp); + KokkosBatched::SerialGesv::invoke(A, x, b, tmp); } inline void run() { @@ -51,7 +52,8 @@ struct Functor_TestBatchedSerialGesv { } }; -template +template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; typedef Kokkos::Details::ArithTraits ats; @@ -81,8 +83,8 @@ void impl_test_batched_gesv(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedSerialGesv(A, tmp, X, - B) + Functor_TestBatchedSerialGesv(A, tmp, X, B) .run(); Kokkos::fence(); @@ -106,7 +108,7 @@ void impl_test_batched_gesv(const int N, const int BlkSize) { } // namespace Gesv } // namespace Test -template +template int test_batched_gesv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { @@ -116,8 +118,8 @@ int test_batched_gesv() { VectorType; for (int i = 3; i < 10; ++i) { - Test::Gesv::impl_test_batched_gesv( - 1024, i); + Test::Gesv::impl_test_batched_gesv(1024, i); } } #endif @@ -129,8 +131,8 @@ int test_batched_gesv() { VectorType; for (int i = 3; i < 10; ++i) { - Test::Gesv::impl_test_batched_gesv( - 1024, i); + Test::Gesv::impl_test_batched_gesv(1024, i); } } #endif diff --git a/unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp b/unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp index f8d391a428..84a630efa3 100644 --- a/unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp +++ b/unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp @@ -1,11 +1,19 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_serial_gesv_float) { - test_batched_gesv(); +TEST_F(TestCategory, batched_scalar_serial_gesv_static_pivoting_float) { + test_batched_gesv(); +} +TEST_F(TestCategory, batched_scalar_serial_gesv_no_pivoting_float) { + test_batched_gesv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_serial_gesv_double) { - test_batched_gesv(); +TEST_F(TestCategory, batched_scalar_serial_gesv_static_pivoting_double) { + test_batched_gesv(); +} +TEST_F(TestCategory, batched_scalar_serial_gesv_no_pivoting_double) { + test_batched_gesv(); } -#endif \ No newline at end of file +#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamGesv.hpp b/unit_test/batched/dense/Test_Batched_TeamGesv.hpp index bdef5eb68d..8f6bcf9f9d 100644 --- a/unit_test/batched/dense/Test_Batched_TeamGesv.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamGesv.hpp @@ -17,7 +17,8 @@ using namespace KokkosBatched; namespace Test { namespace TeamGesv { -template +template struct Functor_TestBatchedTeamGesv { const MatrixType _A; const VectorType _X; @@ -36,7 +37,7 @@ struct Functor_TestBatchedTeamGesv { auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); member.team_barrier(); - KokkosBatched::TeamGesv::invoke(member, A, x, b); + KokkosBatched::TeamGesv::invoke(member, A, x, b); member.team_barrier(); } @@ -63,7 +64,8 @@ struct Functor_TestBatchedTeamGesv { } }; -template +template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; typedef Kokkos::Details::ArithTraits ats; @@ -92,7 +94,8 @@ void impl_test_batched_gesv(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedTeamGesv(A, X, B) + Functor_TestBatchedTeamGesv( + A, X, B) .run(); Kokkos::fence(); @@ -116,7 +119,7 @@ void impl_test_batched_gesv(const int N, const int BlkSize) { } // namespace TeamGesv } // namespace Test -template +template int test_batched_team_gesv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { @@ -126,8 +129,8 @@ int test_batched_team_gesv() { VectorType; for (int i = 3; i < 10; ++i) { - Test::TeamGesv::impl_test_batched_gesv(1024, i); + Test::TeamGesv::impl_test_batched_gesv(1024, i); } } #endif @@ -139,8 +142,8 @@ int test_batched_team_gesv() { VectorType; for (int i = 3; i < 10; ++i) { - Test::TeamGesv::impl_test_batched_gesv(1024, i); + Test::TeamGesv::impl_test_batched_gesv(1024, i); } } #endif diff --git a/unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp index 6b01a23d65..8dca15a4a2 100644 --- a/unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp @@ -1,11 +1,21 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_team_gesv_float) { - test_batched_team_gesv(); +TEST_F(TestCategory, batched_scalar_team_gesv_static_pivoting_float) { + test_batched_team_gesv(); +} +TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_float) { + test_batched_team_gesv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_team_gesv_double) { - test_batched_team_gesv(); +TEST_F(TestCategory, batched_scalar_team_gesv_static_pivoting_double) { + test_batched_team_gesv(); +} +TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_double) { + test_batched_team_gesv(); } -#endif \ No newline at end of file +#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp index beac7b2e45..9ee05cb919 100644 --- a/unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp @@ -17,7 +17,8 @@ using namespace KokkosBatched; namespace Test { namespace TeamVectorGesv { -template +template struct Functor_TestBatchedTeamVectorGesv { const MatrixType _A; const VectorType _X; @@ -36,7 +37,8 @@ struct Functor_TestBatchedTeamVectorGesv { auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); member.team_barrier(); - KokkosBatched::TeamVectorGesv::invoke(member, A, x, b); + KokkosBatched::TeamVectorGesv::invoke(member, A, x, + b); member.team_barrier(); } @@ -63,7 +65,8 @@ struct Functor_TestBatchedTeamVectorGesv { } }; -template +template void impl_test_batched_gesv(const int N, const int BlkSize) { typedef typename MatrixType::value_type value_type; typedef Kokkos::Details::ArithTraits ats; @@ -92,7 +95,8 @@ void impl_test_batched_gesv(const int N, const int BlkSize) { Kokkos::fence(); - Functor_TestBatchedTeamVectorGesv(A, X, B) + Functor_TestBatchedTeamVectorGesv(A, X, B) .run(); Kokkos::fence(); @@ -116,7 +120,7 @@ void impl_test_batched_gesv(const int N, const int BlkSize) { } // namespace TeamVectorGesv } // namespace Test -template +template int test_batched_teamvector_gesv() { #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) { @@ -127,7 +131,8 @@ int test_batched_teamvector_gesv() { for (int i = 3; i < 10; ++i) { Test::TeamVectorGesv::impl_test_batched_gesv(1024, i); + VectorType, AlgoTagType>( + 1024, i); } } #endif @@ -140,7 +145,8 @@ int test_batched_teamvector_gesv() { for (int i = 3; i < 10; ++i) { Test::TeamVectorGesv::impl_test_batched_gesv(1024, i); + VectorType, AlgoTagType>( + 1024, i); } } #endif diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp index a589f4aa2b..d83706718c 100644 --- a/unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp @@ -1,11 +1,21 @@ #if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_teamvector_gesv_float) { - test_batched_teamvector_gesv(); +TEST_F(TestCategory, batched_scalar_teamvector_gesv_static_pivoting_float) { + test_batched_teamvector_gesv(); +} +TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_float) { + test_batched_teamvector_gesv(); } #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_teamvector_gesv_double) { - test_batched_teamvector_gesv(); +TEST_F(TestCategory, batched_scalar_teamvector_gesv_static_pivoting_double) { + test_batched_teamvector_gesv(); +} +TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_double) { + test_batched_teamvector_gesv(); } -#endif \ No newline at end of file +#endif From c65915d1af662d5e72b689858152c2459ac5a7a7 Mon Sep 17 00:00:00 2001 From: kliegeois Date: Thu, 21 Apr 2022 06:59:32 -0600 Subject: [PATCH 089/261] Reset the reducer values after the first search --- src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index 3f6cce79f7..a05386642e 100644 --- a/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -223,6 +223,8 @@ KOKKOS_INLINE_FUNCTION int TeamStaticPivoting::invoke( }, reducer_value); row_index = value.loc; + value.loc = 0; + value.val = 0.; Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member, n), [&](const int &j, reducer_value_type &update) { @@ -326,6 +328,8 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( }, reducer_value); row_index = value.loc; + value.loc = 0; + value.val = 0.; Kokkos::parallel_reduce( Kokkos::TeamVectorRange(member, n), [&](const int &j, reducer_value_type &update) { From ffcd6adfd555669b37f5fb7eda27df8a5048db33 Mon Sep 17 00:00:00 2001 From: Kim Liegeois Date: Thu, 21 Apr 2022 08:27:23 -0600 Subject: [PATCH 090/261] Use Kokkos::ArithTraits::zero() instead of 0. --- src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index a05386642e..5a07a58990 100644 --- a/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -224,7 +224,7 @@ KOKKOS_INLINE_FUNCTION int TeamStaticPivoting::invoke( reducer_value); row_index = value.loc; value.loc = 0; - value.val = 0.; + value.val = Kokkos::ArithTraits::zero(); Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member, n), [&](const int &j, reducer_value_type &update) { @@ -329,7 +329,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( reducer_value); row_index = value.loc; value.loc = 0; - value.val = 0.; + value.val = Kokkos::ArithTraits::zero(); Kokkos::parallel_reduce( Kokkos::TeamVectorRange(member, n), [&](const int &j, reducer_value_type &update) { From 41070dc8733ba8cce46fd34a9322efd362c29dd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Wed, 19 Jan 2022 14:40:33 +0100 Subject: [PATCH 091/261] Copy SpGEMM files for reference diff --- .../KokkosKernels_BlockHashmapAccumulator.hpp | 821 ++++++++ ...parse_bspgemm_numeric_eti_spec_inst.cpp.in | 53 + ...arse_bspgemm_numeric_eti_spec_avail.hpp.in | 51 + ...parse_bspgemm_numeric_eti_spec_decl.hpp.in | 51 + src/sparse/impl/KokkosSparse_bspgemm_impl.hpp | 852 ++++++++ .../impl/KokkosSparse_bspgemm_impl_def.hpp | 294 +++ .../impl/KokkosSparse_bspgemm_impl_kkmem.hpp | 1855 +++++++++++++++++ .../impl/KokkosSparse_bspgemm_impl_seq.hpp | 234 +++ .../impl/KokkosSparse_bspgemm_impl_speed.hpp | 637 ++++++ .../KokkosSparse_bspgemm_numeric_spec.hpp | 436 ++++ unit_test/sparse/Test_Sparse_bspgemm.hpp | 459 ++++ 11 files changed, 5743 insertions(+) create mode 100644 src/common/KokkosKernels_BlockHashmapAccumulator.hpp create mode 100644 src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in create mode 100644 src/sparse/impl/KokkosSparse_bspgemm_impl.hpp create mode 100644 src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp create mode 100644 src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp create mode 100644 src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp create mode 100644 src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp create mode 100644 src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp create mode 100644 unit_test/sparse/Test_Sparse_bspgemm.hpp diff --git a/src/common/KokkosKernels_BlockHashmapAccumulator.hpp b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp new file mode 100644 index 0000000000..b7f39f75c2 --- /dev/null +++ b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp @@ -0,0 +1,821 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP +#define _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP +#include +#include + +//#define HASHMAPACCUMULATOR_ASSERT_ENABLED + +namespace KokkosKernels { + +namespace Experimental { + +/** + * @brief types of hash operations supported by HashmapAccumulator. + * + * /var bitwiseAnd: Performs key & hashOpRHS + * /var modulo: Performs key % hashOpRHS + * /var pow2Modulo: Performs key & (hashOpRHS - 1) + */ +struct HashOpType { + struct bitwiseAnd {}; + struct modulo {}; + struct pow2Modulo {}; +}; + +template +/** + * \brief HashmapAccumulator class + * The use of this is described in the paper: + * "Performance-portable sparse matrix-matrix multiplication for many-core + * architectures" ( https://ieeexplore.ieee.org/abstract/document/7965111/ ) in + * section III.D + * + * Public members: + * \var hash_begins: Holds the beginning indices of the linked lists + * corresponding to hash values [Begins] + * \var hash_nexts: Holds the indicies of the next elements + * within the linked list [Nexts] + * \var keys: This stores the column indices of the crs matrix [Ids] + * \var values: This store the numerical values (matrix elements) [Values] + * + * Private members: + * \var __max_value_size: The length of the two arrays (keys and hash_nexts) + * \var __hashOpRHS: The right hand side of the requested hash operation. + * \var __insert_success: Value to return upon insertion success. + * \var __insert_full: Value to return upon insertion failure. + */ +struct HashmapAccumulator { + // begin public members + // issue-508, TODO: It's best for used_size to be an internal member of this + // class but the current use-cases rely on used_size to be a parameter to the + // below insertion routines. One way to remove used_size as a parameter to the + // insertion routines is to instantiate multiple HashmapAccumulator objects + // (one hashmap for each team of threads) instead of using a single + // HashmapAccumulator object for multiple teams of threads; this entails + // major refactoring throughout the kokkos-kernels code base. + // Making used_size a pointer and private member of this + // class still exposes access to this member outside of the class and is + // not a good option. + // size_type used_size; + + // issue-508, TODO: The hash_begins, hash_nexts, keys, values, + // __insert_success, and __insert_full members should all be private as well. + // They should be managed solely by this HashmapAccumulator class: initialized + // in the constructor(s) and only managed by HashmapAccumulator insertion + // routines. Making these members private requires major refactoring + // throughout the kokkos-kernels code base. If allocations for these members + // must really live outside this class, we need new members that break + // __max_value_size into: hash_begins_len, hash_nexts_len, keys_len, and + // values_len...! + + size_type *hash_begins; + size_type *hash_nexts; + key_type *keys; + value_type *values; + + /** + * \brief default constructor HashmapAccumulator + * Sets used_size to 0, __insert_success to 0, __insert_full to 1, and + * __hashOpRHS to 0. + * + * Assumption: hash_begins_ are all initialized to -1. + */ + KOKKOS_INLINE_FUNCTION + HashmapAccumulator() + : hash_begins(), + hash_nexts(), + keys(), + values(), + __max_value_size(), + __hashOpRHS(0) {} + + /** + * \brief parameterized constructor HashmapAccumulator + * Sets used_size to 0, __insert_success to 0, and __insert_full to 1. + * + * \param max_value_size_: The length of the two arrays (keys and hash_nexts) + * \param hashOpRHS: The right hand side of the requested hash + * operation. \param hash_begins_: Holds the beginning indices of the + * linked lists corresponding to hash values [Begins] \param hash_nexts_: + * Holds the indicies of the next elements within the linked list [Nexts] + * \param keys_: This stores the column indices of (??) [Ids] + * \param values_: This store the (matrix element?) numerical value of + * (??) [Values] + * + * Assumption: hash_begins_ are all initialized to -1. + */ + KOKKOS_INLINE_FUNCTION + HashmapAccumulator(const size_type max_value_size_, const size_type hashOpRHS, + size_type *hash_begins_, size_type *hash_nexts_, + key_type *keys_, value_type *values_) + : hash_begins(hash_begins_), + hash_nexts(hash_nexts_), + keys(keys_), + values(values_), + __max_value_size(max_value_size_), + __hashOpRHS(hashOpRHS) { + // Substract 1 and use the bitwiseAnd __compute_hash member. + if (std::is_same::value) { + __hashOpRHS -= 1; + } + } + + // function to be called from device. + // Accumulation is OR operation. + // Insertion is sequential, no race condition for the insertion. + KOKKOS_INLINE_FUNCTION + int sequential_insert_into_hash_mergeOr_TrackHashes(key_type key, + value_type value, + size_type *used_size_, + size_type *used_hash_size, + size_type *used_hashes) { + size_type hash, i, my_index; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + values[i] = values[i] | value; + return __insert_success; + } + } + + if (*used_size_ >= __max_value_size) return __insert_full; + my_index = (*used_size_)++; + + if (hash_begins[hash] == -1) { + used_hashes[used_hash_size[0]++] = hash; + } + hash_nexts[my_index] = hash_begins[hash]; + + hash_begins[hash] = my_index; + keys[my_index] = key; + values[my_index] = value; + return __insert_success; + } + + // function to be called from device. + // Accumulation is OR operation. + // TODO: This function is for triangle counting. + // Assume that there are 2 values for triangle count. + KOKKOS_INLINE_FUNCTION + int sequential_insert_into_hash_mergeOr_TriangleCount_TrackHashes( + key_type key, value_type value, value_type *values2, + size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { + size_type hash, i, my_index; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + values2[i] = values2[i] | (values[i] & value); + values[i] = values[i] | value; + return __insert_success; + } + } + + if (*used_size_ >= __max_value_size) return __insert_full; + my_index = (*used_size_)++; + + if (hash_begins[hash] == -1) { + used_hashes[used_hash_size[0]++] = hash; + } + hash_nexts[my_index] = hash_begins[hash]; + + hash_begins[hash] = my_index; + keys[my_index] = key; + values[my_index] = value; + values2[my_index] = 0; + return __insert_success; + } + + // this is used in slow triangle counting method. + // L x Incidence + KOKKOS_INLINE_FUNCTION + int sequential_insert_into_hash_mergeAnd_TriangleCount_TrackHashes( + key_type key, value_type value, value_type *values2, + size_type * /*used_size_*/, size_type * /*used_hash_size*/, + size_type * /*used_hashes*/) { + size_type hash, i; + + if (key == -1) return __insert_success; + + // this function will only try to do an AND operation with + // existing keys. If the key is not there, returns __insert_full. + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + // values2[i] = values2[i] | (values[i] & value); + values[i] = values[i] & value; + ++values2[i]; + return __insert_success; + } + } + return __insert_full; + } + + // this is used in LxL or Incidence^T x L + KOKKOS_INLINE_FUNCTION + value_type sequential_insert_into_hash_mergeAnd_TriangleCount_TrackHashes( + key_type key, value_type value) { + size_type hash, i; + + if (key == -1) return __insert_success; + + // this function will only try to do an AND operation with + // existing keys. If the key is not there, returns __insert_full. + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + return values[i] & value; + } + } + return 0; + } + + // this is used in slow triangle counting method. + // L x Incidence + KOKKOS_INLINE_FUNCTION + int sequential_insert_into_hash_TriangleCount_TrackHashes( + key_type key, value_type value, value_type *values2, + size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { + size_type hash, my_index; + + if (key == -1) return __insert_success; + + // this function will directly insert, won't check if it exists already. + if (*used_size_ >= __max_value_size) return __insert_full; + my_index = (*used_size_)++; + + keys[my_index] = key; + values[my_index] = value; + values2[my_index] = 1; + + hash = __compute_hash(key, __hashOpRHS); + if (hash_begins[hash] == -1) { + hash_begins[hash] = my_index; + used_hashes[used_hash_size[0]++] = hash; + } else { + hash_nexts[my_index] = hash_begins[hash]; + hash_begins[hash] = my_index; + } + return __insert_success; + } + + // this is used in LxL or Incidence^T x L + KOKKOS_INLINE_FUNCTION + int sequential_insert_into_hash_TriangleCount_TrackHashes( + key_type key, value_type value, size_type *used_size_, + size_type *used_hash_size, + size_type *used_hashes) // issue-508, TODO figure out what this + // "used_hashes" is for + { + size_type hash, my_index; + + if (key == -1) return __insert_success; + + // this function will directly insert, won't check if it exists already. + if (*used_size_ >= __max_value_size) return __insert_full; + my_index = (*used_size_)++; + + keys[my_index] = key; + values[my_index] = value; + + hash = __compute_hash(key, __hashOpRHS); + if (hash_begins[hash] == -1) { + hash_begins[hash] = my_index; + used_hashes[used_hash_size[0]++] = hash; + } else { + hash_nexts[my_index] = hash_begins[hash]; + hash_begins[hash] = my_index; + } + return __insert_success; + } + + // function to be called from device. + // Insertion is sequential, no race condition for the insertion. + // the mergeadd used in the numeric of KKMEM. + KOKKOS_INLINE_FUNCTION + int sequential_insert_into_hash_mergeAdd_TrackHashes( + key_type key, value_type value, size_type *used_size_, + size_type *used_hash_size, size_type *used_hashes) { + size_type hash, i, my_index; + + if (key == -1) return __insert_success; + + // issue-508, TODO: ensure that i < __max_value_size, but + // need information about length of keys, values, and hash_nexts first! + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + values[i] = values[i] + value; + return __insert_success; + } + } + + my_index = (*used_size_)++; + + if (hash_begins[hash] == -1) { + used_hashes[used_hash_size[0]++] = hash; + } + hash_nexts[my_index] = hash_begins[hash]; + + hash_begins[hash] = my_index; + keys[my_index] = key; + values[my_index] = value; + return __insert_success; + } + + // no values. simply adds to the keys. + // used in the compression to count the sets. + // also used in the symbolic of spgemm if no compression is applied. + KOKKOS_INLINE_FUNCTION + int sequential_insert_into_hash_TrackHashes(key_type key, + size_type *used_size_, + size_type *used_hash_size, + size_type *used_hashes) { + size_type hash, i, my_index; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + return __insert_success; + } + } + + my_index = (*used_size_)++; + + if (hash_begins[hash] == -1) { + used_hashes[used_hash_size[0]++] = hash; + } + hash_nexts[my_index] = hash_begins[hash]; + + hash_begins[hash] = my_index; + keys[my_index] = key; + return __insert_success; + } + + // used in the kkmem's numeric phase for second level hashmaps. + // function to be called from device. + // Accumulation is Add operation. It is not atomicAdd, as this + // is for the cases where we know that none of the simultanous + // insertions will have the same key. + // Insertion is simulteanous for the vector lanes of a thread. + // used_size should be a shared pointer among the thread vectors + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_mergeAdd_TrackHashes( + const key_type key, const value_type value, + volatile size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { + size_type hash, i, my_write_index, hashbeginning; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + if (hash != -1) { + i = hash_begins[hash]; + + for (; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + values[i] = values[i] + value; + return __insert_success; + } + } + } else { + return __insert_success; + } + + my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= __max_value_size) { + return __insert_full; + } else { + keys[my_write_index] = key; + values[my_write_index] = value; + +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA and up because warps do not go in SIMD + // fashion anymore. while some thread might insert my_write_index into + // linked list, another thread in the warp might be reading keys in above + // loop. before inserting the new value in liked list -- which is done + // with atomic exchange below, we make sure that the linked is is complete + // my assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, + // because no two keys will be inserted into hashmap at the same time, as + // rows have unique columns. + + // Neither the compiler nor the execution unit can re-order the line + // directly below with the next line performing the atomic_exchange as the + // atomic exchange writes to hash_begins[hash] and this line reads from + // hash_begins[hash]. + // This line is needed such that threads of execution can still access the + // old linked list, after hash_begins+hash has been atomically overwritten + // with my_write_index but before hash_nexts[my_write_index] is + // overwritten with hashbeginning. If this line was not here, threads may + // not be able to access the dangling linked list since + // hash_nexts[my_write_index] would still be -1. + hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + if (hashbeginning == -1) { + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = + hash; + } + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } + + // NOTE: this is an exact copy of vector_atmoic_insert_into_hash_mergeAdd from + // https://github.com/kokkos/kokkos-kernels/blob/750fe24508a69ed4dba92bb4a9e17a6094b1a083/src/common/KokkosKernels_HashmapAccumulator.hpp#L442-L502 + template + KOKKOS_INLINE_FUNCTION int + vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( + const team_member_t & /* teamMember */, const int /* vector_size */, + size_type hash, const key_type key, const value_type value, + volatile size_type *used_size_, const size_type max_value_size_) { + // Cannot compute hash here due to impl_speed use-case + // hash = __compute_hash(key, __hashOpRHS); + if (key == -1) return __insert_success; + + if (hash != -1) { + size_type i = hash_begins[hash]; + for (; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + values[i] = values[i] + value; + return __insert_success; + } + } + } else { + return __insert_success; + } + + // Ensure that threads don't continue incrementing used_size_ if the hashmap + // is full, used_size_ could overflow and result in undefined behavior. + if (used_size_[0] >= max_value_size_) { + return __insert_full; + } + size_type my_write_index = + Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= max_value_size_) { + return __insert_full; + } else { + keys[my_write_index] = key; + values[my_write_index] = value; + +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA and up because warps do not go in SIMD + // fashion anymore. while some thread might insert my_write_index into + // linked list, another thread in the warp might be reading keys in above + // loop. before inserting the new value in liked list -- which is done + // with atomic exchange below, we make sure that the linked is is complete + // my assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, + // because no two keys will be inserted into hashmap at the same time, as + // rows have unique columns. + + // Neither the compiler nor the execution unit can re-order the line + // directly below with the next line performing the atomic_exchange as the + // atomic exchange writes to hash_begins[hash] and this line reads from + // hash_begins[hash]. + // This line is needed such that threads of execution can still access the + // old linked list, after hash_begins+hash has been atomically overwritten + // with my_write_index but before hash_nexts[my_write_index] is + // overwritten with hashbeginning. If this line was not here, threads may + // not be able to access the dangling linked list since + // hash_nexts[my_write_index] would still be -1. + hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + // Atomically: + // hashbeginning = hash_begins[hash] + // hash_begins[hash] = my_write_index + // hash_nexts[my_write_index] = hash_begins[hash] + size_type hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } + + // used in kkmem's numeric phase to insert to first level hashmaps. + // function to be called from device. + // Accumulation is Add operation. It is not atomicAdd, as this + // is for the cases where we know that none of the simultanous + // insertions will have the same key. + // Insertion is simulteanous for the vector lanes of a thread. + // used_size should be a shared pointer among the thread vectors + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_mergeAdd(const key_type key, + const value_type value, + volatile size_type *used_size_) { + if (key == -1) return __insert_success; + + return vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( + nullptr, 0, __compute_hash(key, __hashOpRHS), key, value, used_size_, + __max_value_size); + } + + // used in symbolic of kkmem if the compression is not applied. + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash(const key_type &key, + volatile size_type *used_size_) { + size_type hash, i, my_write_index, hashbeginning; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + return __insert_success; + } + } + + my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= __max_value_size) { + return __insert_full; + } else { + keys[my_write_index] = key; + +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA and up because warps do not go in SIMD + // fashion anymore. while some thread might insert my_write_index into + // linked list, another thread in the warp might be reading keys in above + // loop. before inserting the new value in liked list -- which is done + // with atomic exchange below, we make sure that the linked is is complete + // my assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, + // because no two keys will be inserted into hashmap at the same time, as + // rows have unique columns. + hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } + + // function to be called from device. + // Accumulation is Add operation. It is not atomicAdd, as this + // is for the cases where we know that none of the simultanous + // insertions will have the same key. + // Insertion is simulteanous for the vector lanes of a thread. + // used_size should be a shared pointer among the thread vectors + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_mergeOr(const key_type &key, + const value_type &value, + volatile size_type *used_size_) { + size_type hash, i, my_write_index, hashbeginning; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + values[i] = values[i] | value; + return __insert_success; + } + } + + my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= __max_value_size) { + return __insert_full; + } else { + keys[my_write_index] = key; + values[my_write_index] = value; + +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA and up because warps do not go in SIMD + // fashion anymore. while some thread might insert my_write_index into + // linked list, another thread in the warp might be reading keys in above + // loop. before inserting the new value in liked list -- which is done + // with atomic exchange below, we make sure that the linked is is complete + // my assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, + // because no two keys will be inserted into hashmap at the same time, as + // rows have unique columns. + hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } + + // function to be called from device. + // Accumulation is Add operation. It is not atomicAdd, as this + // is for the cases where we know that none of the simultanous + // insertions will have the same key. + // Insertion is simulteanous for the vector lanes of a thread. + // used_size should be a shared pointer among the thread vectors + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_mergeOr_TrackHashes( + const key_type &key, const value_type &value, + volatile size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { + size_type hash, i, my_write_index, hashbeginning; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + values[i] = values[i] | value; + return __insert_success; + } + } + + my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= __max_value_size) { + return __insert_full; + } else { + keys[my_write_index] = key; + values[my_write_index] = value; + +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA and up because warps do not go in SIMD + // fashion anymore. while some thread might insert my_write_index into + // linked list, another thread in the warp might be reading keys in above + // loop. before inserting the new value in liked list -- which is done + // with atomic exchange below, we make sure that the linked is is complete + // my assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, + // because no two keys will be inserted into hashmap at the same time, as + // rows have unique columns. + hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + if (hashbeginning == -1) { + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = + hash; + } + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } + + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_TrackHashes(const key_type &key, + volatile size_type *used_size_, + size_type *used_hash_size, + size_type *used_hashes) { + size_type hash, i, my_write_index, hashbeginning; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + return __insert_success; + } + } + + my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= __max_value_size) { + return __insert_full; + } else { + keys[my_write_index] = key; + +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA and up because warps do not go in SIMD + // fashion anymore. while some thread might insert my_write_index into + // linked list, another thread in the warp might be reading keys in above + // loop. before inserting the new value in liked list -- which is done + // with atomic exchange below, we make sure that the linked is is complete + // my assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, + // because no two keys will be inserted into hashmap at the same time, as + // rows have unique columns. + hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + if (hashbeginning == -1) { + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = + hash; + } + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } + // end public members + private: + size_type __max_value_size; + size_type __hashOpRHS; + static constexpr int __insert_success = 0; + static constexpr int __insert_full = 1; + + template ::value || + std::is_same::value, + std::size_t>::type = 0> + KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type bitmask) { + size_type hash = key & bitmask; +#ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED + if (hash == -1) Kokkos::abort("__compute_hash: hash = -1"); + if (key == -1) Kokkos::abort("__compute_hash: key = -1"); +#endif // HASHMAPACCUMULATOR_ASSERT_ENABLED + return hash; + } + + template ::value, + std::size_t>::type = 0> + KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type divisor) { + size_type hash = key % divisor; +#ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED + if (hash == -1) Kokkos::abort("__compute_hash: hash = -1"); + if (key == -1) Kokkos::abort("__compute_hash: key = -1"); +#endif // HASHMAPACCUMULATOR_ASSERT_ENABLED + return hash; + } + // private +}; // struct HashmapAccumulator + +} // namespace Experimental +} // namespace KokkosKernels + +#endif // _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP diff --git a/src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..69f8fce032 --- /dev/null +++ b/src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" + +#include "KokkosSparse_spgemm_numeric_spec.hpp" +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_NUMERIC_ETI_INST_BLOCK@ + } //IMPL +} //Kokkos \ No newline at end of file diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..c1edd15270 --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_AVAIL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_NUMERIC_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..6b31499d52 --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPGEMM_NUMERIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp new file mode 100644 index 0000000000..09a8bf212a --- /dev/null +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp @@ -0,0 +1,852 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSSPGEMMIMPL_HPP +#define _KOKKOSSPGEMMIMPL_HPP + +//#define KOKKOSKERNELS_ANALYZE_COMPRESSION +//#define KOKKOSKERNELS_ANALYZE_MEMORYACCESS +//#define HASHTRACK + +//#define TRACK_INSERTS +//#define GPU_EXPERIMENTAL +//#define NUMERIC_USE_STATICMEM +//#define twostep +#include +#include +#include +#include +#include +#include +#include +#include + +#include "KokkosKernels_HashmapAccumulator.hpp" +#include "KokkosKernels_Uniform_Initialized_MemoryPool.hpp" +#include "KokkosSparse_spgemm_handle.hpp" +#include "KokkosGraph_Distance1Color.hpp" + +namespace KokkosSparse { + +namespace Impl { + +template +class KokkosSPGEMM { + public: + typedef a_row_view_t_ a_row_view_t; + typedef a_lno_nnz_view_t_ a_in_lno_nnz_view_t; + typedef a_scalar_nnz_view_t_ a_in_scalar_nnz_view_t; + + typedef b_lno_row_view_t_ b_in_lno_row_view_t; + typedef b_lno_nnz_view_t_ b_in_lno_nnz_view_t; + typedef b_scalar_nnz_view_t_ b_in_scalar_nnz_view_t; + + typedef typename a_row_view_t::non_const_value_type size_type; + typedef typename a_row_view_t::const_value_type const_size_type; + + typedef typename a_in_lno_nnz_view_t::non_const_value_type nnz_lno_t; + typedef typename a_in_lno_nnz_view_t::const_value_type const_nnz_lno_t; + + typedef typename a_in_scalar_nnz_view_t::non_const_value_type scalar_t; + typedef typename a_in_scalar_nnz_view_t::const_value_type const_scalar_t; + + typedef typename a_row_view_t::const_type const_a_lno_row_view_t; + typedef typename a_row_view_t::non_const_type non_const_a_lno_row_view_t; + + typedef typename a_in_lno_nnz_view_t::const_type const_a_lno_nnz_view_t; + typedef + typename a_in_lno_nnz_view_t::non_const_type non_const_a_lno_nnz_view_t; + + typedef typename a_in_scalar_nnz_view_t::const_type const_a_scalar_nnz_view_t; + typedef typename a_in_scalar_nnz_view_t::non_const_type + non_const_a_scalar_nnz_view_t; + + typedef typename b_in_lno_row_view_t::const_type const_b_lno_row_view_t; + typedef + typename b_in_lno_row_view_t::non_const_type non_const_b_lno_row_view_t; + + typedef typename b_in_lno_nnz_view_t::const_type const_b_lno_nnz_view_t; + typedef + typename b_in_lno_nnz_view_t::non_const_type non_const_b_lno_nnz_view_t; + + typedef typename b_in_scalar_nnz_view_t::const_type const_b_scalar_nnz_view_t; + typedef typename b_in_scalar_nnz_view_t::non_const_type + non_const_b_scalar_nnz_view_t; + + typedef typename HandleType::HandleExecSpace MyExecSpace; + typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; + typedef + typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; + + typedef + typename HandleType::row_lno_temp_work_view_t row_lno_temp_work_view_t; + typedef typename HandleType::row_lno_persistent_work_view_t + row_lno_persistent_work_view_t; + typedef typename HandleType::row_lno_persistent_work_host_view_t + row_lno_persistent_work_host_view_t; // Host view type + + typedef + typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; + typedef typename HandleType::nnz_lno_persistent_work_view_t + nnz_lno_persistent_work_view_t; + typedef typename HandleType::nnz_lno_persistent_work_host_view_t + nnz_lno_persistent_work_host_view_t; // Host view type + + typedef typename HandleType::scalar_temp_work_view_t scalar_temp_work_view_t; + typedef typename HandleType::scalar_persistent_work_view_t + scalar_persistent_work_view_t; + + typedef typename HandleType::bool_persistent_view_t bool_persistent_view_t; + typedef typename HandleType::bool_temp_view_t bool_temp_view_t; + + typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::TeamPolicy team_policy_t; + typedef typename team_policy_t::member_type team_member_t; + + struct CountTag {}; + struct GPUCountTag {}; + struct CountTag2 {}; + + struct FillTag {}; + struct FillTag2 {}; + struct MultiCoreDenseAccumulatorTag {}; + struct MultiCoreDenseAccumulatorTag2 {}; + struct MultiCoreDenseAccumulatorTag3 {}; + struct NoCompressMultiCoreDenseAccumulatorTag {}; + struct NoCompressMultiCoreDenseAccumulatorTag2 {}; + struct NoCompressMultiCoreDenseAccumulatorTag3 {}; + struct MultiCoreTag {}; + struct MultiCoreTag2 {}; + struct MultiCoreTag3 {}; + struct MultiCoreTag4 {}; + struct MultiCoreTag5 {}; + struct MultiCoreTag6 {}; + struct GPUTag {}; + struct GPUTag2 {}; + struct GPUTag3 {}; + struct GPUTag4 {}; + struct GPUTag5 {}; + struct GPUTag6 {}; + + struct Numeric1Tag {}; + struct Numeric2Tag {}; + struct Numeric3Tag {}; + + typedef Kokkos::TeamPolicy + multicore_dense_team_count_policy_t; + typedef Kokkos::TeamPolicy + multicore_dense_team2_count_policy_t; + typedef Kokkos::TeamPolicy + multicore_dense_team3_count_policy_t; + + typedef Kokkos::TeamPolicy + nc_multicore_dense_team_count_policy_t; + typedef Kokkos::TeamPolicy + nc_multicore_dense_team2_count_policy_t; + typedef Kokkos::TeamPolicy + nc_multicore_dense_team3_count_policy_t; + + typedef Kokkos::TeamPolicy > + nc_dynamic_multicore_dense_team_count_policy_t; + typedef Kokkos::TeamPolicy > + nc_dynamic_multicore_dense_team2_count_policy_t; + typedef Kokkos::TeamPolicy > + nc_dynamic_multicore_dense_team3_count_policy_t; + + typedef Kokkos::TeamPolicy multicore_team_policy_t; + typedef Kokkos::TeamPolicy + multicore_team_policy2_t; + typedef Kokkos::TeamPolicy + multicore_team_policy3_t; + typedef Kokkos::TeamPolicy + multicore_team_policy4_t; + typedef Kokkos::TeamPolicy + multicore_team_policy5_t; + typedef Kokkos::TeamPolicy + multicore_team_policy6_t; + + typedef Kokkos::TeamPolicy gpu_team_policy_t; + typedef Kokkos::TeamPolicy gpu_team_policy2_t; + typedef Kokkos::TeamPolicy gpu_team_policy3_t; + typedef Kokkos::TeamPolicy gpu_team_policy4_t; + typedef Kokkos::TeamPolicy gpu_team_policy5_t; + typedef Kokkos::TeamPolicy gpu_team_policy6_t; + + typedef Kokkos::TeamPolicy team_count_policy_t; + typedef Kokkos::TeamPolicy team_count2_policy_t; + + typedef Kokkos::TeamPolicy team_gpucount_policy_t; + + typedef Kokkos::TeamPolicy team_fill_policy_t; + typedef Kokkos::TeamPolicy team_fill2_policy_t; + + typedef Kokkos::TeamPolicy team_numeric1_policy_t; + typedef Kokkos::TeamPolicy team_numeric2_policy_t; + typedef Kokkos::TeamPolicy team_numeric3_policy_t; + + typedef Kokkos::TeamPolicy > + dynamic_multicore_dense_team_count_policy_t; + typedef Kokkos::TeamPolicy > + dynamic_multicore_dense_team2_count_policy_t; + typedef Kokkos::TeamPolicy > + dynamic_multicore_dense_team3_count_policy_t; + + typedef Kokkos::TeamPolicy > + dynamic_multicore_team_policy_t; + typedef Kokkos::TeamPolicy > + dynamic_multicore_team_policy2_t; + typedef Kokkos::TeamPolicy > + dynamic_multicore_team_policy3_t; + typedef Kokkos::TeamPolicy > + dynamic_multicore_team_policy4_t; + typedef Kokkos::TeamPolicy > + dynamic_multicore_team_policy5_t; + typedef Kokkos::TeamPolicy > + dynamic_multicore_team_policy6_t; + + typedef Kokkos::TeamPolicy > + dynamic_team_count_policy_t; + typedef Kokkos::TeamPolicy > + dynamic_team_fill_policy_t; + typedef Kokkos::TeamPolicy > + dynamic_team_numeric1_policy_t; + typedef Kokkos::TeamPolicy > + dynamic_team_numeric2_policy_t; + typedef Kokkos::TeamPolicy > + dynamic_team_numeric3_policy_t; + + typedef Kokkos::TeamPolicy > + dynamic_team_policy_t; + + private: + HandleType *handle; + nnz_lno_t a_row_cnt; + nnz_lno_t b_row_cnt; + nnz_lno_t b_col_cnt; + + const_a_lno_row_view_t row_mapA; + const_a_lno_nnz_view_t entriesA; + const_a_scalar_nnz_view_t valsA; + bool transposeA; + + const_b_lno_row_view_t row_mapB; + const_b_lno_nnz_view_t entriesB; + const_b_scalar_nnz_view_t valsB; + bool transposeB; + + const size_t shmem_size; + size_t concurrency; + const bool use_dynamic_schedule; + const bool KOKKOSKERNELS_VERBOSE; + // const int KOKKOSKERNELS_VERBOSE = 1; + + const KokkosKernels::Impl::ExecSpaceType MyEnumExecSpace; + const SPGEMMAlgorithm spgemm_algorithm; + const SPGEMMAccumulator spgemm_accumulator; + + ////////////////////////////////////////////////////////////////////////////// + //////Function and Struct for matrix compression. + //////Declerations are at KokkosKernels_SPGEMM_impl_compression.hpp + ////////////////////////////////////////////////////////////////////////////// + + /** + * \brief Given a symbolic matrix (a graph), it compresses the graph using + * bits. \param in_row_map: input row pointers. \param in_entries: input + * column entries \param out_row_map: output row pointers of the compressed + * matrix \param out_nnz_indices: output, column set indices of the output + * matrix. \param out_nnz_sets: output, column sets of the output matrix. + * + */ + template + bool compressMatrix(nnz_lno_t n, size_type nnz, in_row_view_t in_row_map, + in_nnz_view_t in_entries, out_rowmap_view_t out_row_map, + out_nnz_view_t &out_nnz_indices, + out_nnz_view_t &out_nnz_sets, bool singleStep); + + public: + /** + *\brief Functor to zip the B matrix. + */ + template + struct SingleStepZipMatrix; + + private: + ////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// + ////BELOW code is for triangle count specific. + ////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// + template + void triangle_count_ai(const int is_symbolic_or_numeric, const nnz_lno_t m, + const size_type *row_mapA_, const nnz_lno_t *entriesA_, + + const size_type bnnz, const size_type *old_row_mapB, + const size_type *row_mapB_, + const nnz_lno_t *entriesSetIndex, + const nnz_lno_t *entriesSets, + + size_type *rowmapC, nnz_lno_t *entriesC, + struct_visit_t visit_applier); + + public: + template + struct TriangleCount; + + template + void KokkosSPGEMM_numeric_triangle(c_row_view_t rowmapC_, + c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_); + + template + void KokkosSPGEMM_symbolic_triangle(c_row_view_t rowmapC_); + template + void KokkosSPGEMM_generic_triangle(visit_struct_t visit_apply); + + /* + template + void KokkosSPGEMM_generic_triangle_no_compression(visit_struct_t visit_apply); + + template + void triangle_count_ai_no_compression( + const nnz_lno_t m, + const size_type* row_mapA_, + const nnz_lno_t * entriesA_, + + const size_type bnnz, + const size_type * rowmapB_begins, + const size_type * rowmapB_ends, + const nnz_lno_t * entriesB, + struct_visit_t visit_applier); + */ + void KokkosSPGEMM_symbolic_triangle_setup(); + + private: + template + void KokkosSPGEMM_numeric_triangle_ai(c_row_view_t rowmapC_, + c_lno_nnz_view_t entriesC_); + + public: + ////////////////////////////////////////////////////////////////////////// + /////BELOW CODE IS TO for SPEED SPGEMM + ////DECL IS AT _speed.hpp + ////////////////////////////////////////////////////////////////////////// + template + struct NumericCMEM_CPU; + + template + struct NumericCMEM; + + private: + /** + * \brief Numeric phase with speed method + */ + template + void KokkosSPGEMM_numeric_speed( + c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_, + KokkosKernels::Impl::ExecSpaceType my_exec_space); + + public: + /* + ////////////////////////////////////////////////////////////////////////// + /////BELOW CODE IS TO for colored SPGEMM + ////DECL IS AT _color.hpp + ////////////////////////////////////////////////////////////////////////// + template struct NumericCCOLOR; + */ + private: + /** + * \brief Numeric phase with speed method + */ + /* + template void KokkosSPGEMM_numeric_color( c_row_view_t rowmapC_, + c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_, + SPGEMMAlgorithm spgemm_algorithm); + + template + void d2_color_c_matrix( + c_row_view_t rowmapC, + c_nnz_view_t entryIndicesC_, + + nnz_lno_t &original_num_colors, + nnz_lno_persistent_work_host_view_t &h_color_xadj, + nnz_lno_persistent_work_view_t &color_adj, + nnz_lno_persistent_work_view_t &vertex_colors_to_store, + + nnz_lno_t &num_colors_in_one_step, + nnz_lno_t &num_multi_color_steps, + SPGEMMAlgorithm spgemm_algorithm); + */ + public: + ////////////////////////////////////////////////////////////////////////// + /////BELOW CODE IS TO for kkmem SPGEMM + ////DECL IS AT _kkmem.hpp + ////////////////////////////////////////////////////////////////////////// + template + struct PortableNumericCHASH; + + private: + // KKMEM only difference is work memory does not use output memory for 2nd + // level accumulator. + template + void KokkosSPGEMM_numeric_hash2( + c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_, + KokkosKernels::Impl::ExecSpaceType my_exec_space); + + template + void KokkosSPGEMM_numeric_hash( + c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_, + KokkosKernels::Impl::ExecSpaceType my_exec_space); +#if defined(KOKKOS_ENABLE_OPENMP) +#ifdef KOKKOSKERNELS_HAVE_OUTER + public: + // OUTER PRODUCT CODES + struct Triplet; + + template + struct OuterProduct; + + template + struct FlopsPerRowOuter; + + private: + template + void sort_triplets(triplet_view_t triplets, size_t num_triplets); + + template + void merge_triplets_on_slow_memory(host_triplet_view_t *triplets, + size_t num_blocks, size_t overall_size, + host_triplet_view_t output_triplets); + + template + size_t final_collapse_triplets_omp(triplet_view_t triplets, + size_t num_triplets, + c_row_view_t &rowmapC_, + c_lno_nnz_view_t &entriesC_, + c_scalar_nnz_view_t &valuesC_); + + template + size_t collapse_triplets(triplet_view_t triplets, size_t num_triplets); + + template + size_t collapse_triplets_omp(triplet_view_t triplets, size_t num_triplets, + triplet_view_t out_triplets); + +#endif +#endif + + template + void KokkosSPGEMM_numeric_outer( + c_row_view_t &rowmapC_, c_lno_nnz_view_t &entriesC_, + c_scalar_nnz_view_t &valuesC_, + KokkosKernels::Impl::ExecSpaceType my_exec_space); + ////////////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////////////// + +#ifdef KOKKOSKERNELS_ANALYZE_MEMORYACCESS + ////////////////////////////////////////////////////////////////////////// + /////BELOW CODE IS TO CALCULATE MEMORY ACCESSES WITH HYPERGRAPH MODEL///// + ////DECL IS AT _memaccess.hpp + ////////////////////////////////////////////////////////////////////////// + public: + // Functor to calculate how many flops is performed per row of C. + template + struct FlopsPerRow; + struct Cache; + + private: + void create_read_write_hg(size_t &overall_flops, + row_lno_temp_work_view_t &c_flop_rowmap, + row_lno_temp_work_view_t &c_comp_a_net_index, + row_lno_temp_work_view_t &c_comp_b_net_index, + nnz_lno_temp_work_view_t &c_comp_row_index, + nnz_lno_temp_work_view_t &c_comp_col_index); + + template + void print_read_write_cost(c_row_view_t rowmapC); + + template + void read_write_cost( + nnz_lno_t num_colors, nnz_lno_t num_multi_colors, + nnz_lno_t num_parallel_colors, bool isGPU, int num_cores, + + nnz_lno_t num_hyperthreads_in_core, nnz_lno_t hyper_threads_in_team, + + int vectorlane, const int cache_line_size, const int data_size, + const int cache_size, + + nnz_lno_persistent_work_host_view_t color_xadj, + typename nnz_lno_persistent_work_view_t::HostMirror color_adj, + typename nnz_lno_persistent_work_view_t::HostMirror vertex_colors, + + size_t overall_flops, + typename row_lno_temp_work_view_t::HostMirror c_flop_rowmap, + typename row_lno_temp_work_view_t::HostMirror c_comp_a_net_index, + typename row_lno_temp_work_view_t::HostMirror c_comp_b_net_index, + typename nnz_lno_temp_work_view_t::HostMirror c_comp_row_index, + typename nnz_lno_temp_work_view_t::HostMirror c_comp_col_index, + c_row_view_t rowmapC, + int write_type // 0 -- KKMEM, 1-KKSPEED, 2- KKCOLOR 3-KKMULTICOLOR + // 4-KKMULTICOLOR2 + ); + +#endif + + public: + ////////////////////////////////////////////////////////////////////////// + /////BELOW CODE IS for public symbolic and numeric functions + ////DECL IS AT _def.hpp + ////////////////////////////////////////////////////////////////////////// + template + void KokkosSPGEMM_numeric(c_row_view_t &rowmapC_, c_lno_nnz_view_t &entriesC_, + c_scalar_nnz_view_t &valuesC_); + // TODO: These are references only for outer product algorithm. + // If the algorithm is removed, then remove the references. + + /** + * \brief Symbolic phase of the SPGEMM. + * \param rowmapC_: row pointers for the result matrix. Allocated before the + * call with size (n+1), where n is the number of rows of first matrix. + */ + template + void KokkosSPGEMM_symbolic(c_row_view_t rowmapC_); + + template + void write_matrix_to_plot(nnz_lno_t &num_colors, + nnz_lno_persistent_work_host_view_t &h_color_xadj, + nnz_lno_persistent_work_view_t &color_adj, + c_row_view_t &rowmapC, + c_nnz_view_t &entryIndicesC_); + + KokkosSPGEMM(HandleType *handle_, nnz_lno_t m_, nnz_lno_t n_, nnz_lno_t k_, + const_a_lno_row_view_t row_mapA_, + const_a_lno_nnz_view_t entriesA_, bool transposeA_, + const_b_lno_row_view_t row_mapB_, + const_b_lno_nnz_view_t entriesB_, bool transposeB_) + : handle(handle_), + a_row_cnt(m_), + b_row_cnt(n_), + b_col_cnt(k_), + row_mapA(row_mapA_), + entriesA(entriesA_), + valsA(), + transposeA(transposeA_), + row_mapB(row_mapB_), + entriesB(entriesB_), + valsB(), + transposeB(transposeB_), + shmem_size(handle_->get_shmem_size()), + concurrency(MyExecSpace::concurrency()), + use_dynamic_schedule(handle_->is_dynamic_scheduling()), + KOKKOSKERNELS_VERBOSE(handle_->get_verbose()), + MyEnumExecSpace(this->handle->get_handle_exec_space()), + spgemm_algorithm( + this->handle->get_spgemm_handle()->get_algorithm_type()), + spgemm_accumulator( + this->handle->get_spgemm_handle()->get_accumulator_type()) + //,row_mapC(), entriesC(), valsC() + {} + + KokkosSPGEMM(HandleType *handle_, nnz_lno_t m_, nnz_lno_t n_, nnz_lno_t k_, + const_a_lno_row_view_t row_mapA_, + const_a_lno_nnz_view_t entriesA_, + const_a_scalar_nnz_view_t valsA_, bool transposeA_, + const_b_lno_row_view_t row_mapB_, + const_b_lno_nnz_view_t entriesB_, + const_b_scalar_nnz_view_t valsB_, bool transposeB_) + : handle(handle_), + a_row_cnt(m_), + b_row_cnt(n_), + b_col_cnt(k_), + row_mapA(row_mapA_), + entriesA(entriesA_), + valsA(valsA_), + transposeA(transposeA_), + row_mapB(row_mapB_), + entriesB(entriesB_), + valsB(valsB_), + transposeB(transposeB_), + shmem_size(handle_->get_shmem_size()), + concurrency(MyExecSpace::concurrency()), + use_dynamic_schedule(handle_->is_dynamic_scheduling()), + KOKKOSKERNELS_VERBOSE(handle_->get_verbose()), + MyEnumExecSpace(this->handle->get_handle_exec_space()), + spgemm_algorithm( + this->handle->get_spgemm_handle()->get_algorithm_type()), + spgemm_accumulator( + this->handle->get_spgemm_handle()->get_accumulator_type()) + //,row_mapB(), entriesC(), valsC() + {} + + ////////////////////////////////////////////////////////////////////////// + /////BELOW CODE IS for symbolic phase + ////DECL IS AT _symbolic.hpp + ////////////////////////////////////////////////////////////////////////// + public: + /*** + * \brief Functor to calculate the row sizes of C. + */ + template + struct StructureC; + + template + struct StructureC_NC; + + template + struct NonzeroesC; + + /** + * \brief Functor to calculate the max flops in a row of SPGEMM. + * + */ + template + struct PredicMaxRowNNZ; + + struct PredicMaxRowNNZIntersection; + struct PredicMaxRowNNZ_p; + + private: + /** + * \brief function return max flops for a row in the result multiplication. + * \param m: number of rows in A + * \param row_mapA: row pointers of A. + * \param entriesA: column indices of A + * \param row_pointers_begin_B: beginning of the row indices for B + * \param row_pointers_end_B: end of the row indices for B + */ + template + size_t getMaxRoughRowNNZ(nnz_lno_t m, a_row_view_t row_mapA_, + a_nnz_view_t entriesA_, + + b_oldrow_view_t row_pointers_begin_B, + b_row_view_t row_pointers_end_B, + size_type *flops_per_row = NULL); + + size_t getMaxRoughRowNNZ_p(const nnz_lno_t m, const size_type annz, + const size_type *row_mapA_, + const nnz_lno_t *entriesA_, + + const size_type *row_pointers_begin_B, + const size_type *row_pointers_end_B); + + size_t getMaxRoughRowNNZIntersection_p( + const nnz_lno_t m, const size_type annz, const size_type *row_mapA_, + const nnz_lno_t *entriesA_, + + const size_type *row_pointers_begin_B, + const size_type *row_pointers_end_B, + nnz_lno_t *min_result_row_for_each_row); + + template + void symbolic_c(nnz_lno_t m, a_r_view_t row_mapA_, a_nnz_view_t entriesA_, + + b_original_row_view_t old_row_mapB, + b_compressed_row_view_t row_mapB_, + b_nnz_view_t entriesSetIndex, b_nnz_view_t entriesSets, + + c_row_view_t rowmapC, nnz_lno_t maxNumRoughNonzeros); + + template + void symbolic_c_no_compression(nnz_lno_t m, a_r_view_t row_mapA_, + a_nnz_view_t entriesA_, + + b_original_row_view_t b_rowmap_begin, + b_compressed_row_view_t b_rowmap_end, + b_nnz_view_t entriesb_, c_row_view_t rowmapC, + nnz_lno_t maxNumRoughNonzeros); + + ////////////////////////////////////////////////////////////////////////// + ///// Jacobi-fused SpGEMM declarations + ////////////////////////////////////////////////////////////////////////// + public: + template < + typename a_row_view_t, typename a_nnz_view_t, typename a_scalar_view_t, + typename b_row_view_t, typename b_nnz_view_t, typename b_scalar_view_t, + typename c_row_view_t, typename c_nnz_view_t, typename c_scalar_view_t, + typename dinv_view_t, typename pool_memory_type> + struct JacobiSpGEMMSparseAcc; + + template + struct JacobiSpGEMMDenseAcc; + + template + void KokkosSPGEMM_jacobi_sparseacc( + c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_, + typename c_scalar_nnz_view_t::const_value_type omega, dinv_view_t dinv, + KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space); + + private: + template + void KokkosSPGEMM_jacobi_denseacc( + c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_, + typename c_scalar_nnz_view_t::const_value_type omega, dinv_view_t dinv, + KokkosKernels::Impl::ExecSpaceType my_exec_space); + + // Utility to compute the number of pool chunks for L2 hashmap accumulators. + // Uses free memory query for accelerators/GPUs but assumes infinite available + // host memory. + // + // chunk_bytes: bytes in each chunk + // ideal_num_chunks: number of chunks that would give each thread/team its own + // chunk (no contention) + template + size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) { + if (!KokkosKernels::Impl::kk_is_gpu_exec_space< + typename Pool::execution_space>()) + return ideal_num_chunks; + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory( + free_byte, total_byte); + size_t required_size = ideal_num_chunks * chunk_bytes; + if (KOKKOSKERNELS_VERBOSE) + std::cout << "\tmempool required size:" << required_size + << " free_byte:" << free_byte << " total_byte:" << total_byte + << std::endl; + size_t num_chunks = ideal_num_chunks; + // If there is not enough memory to safely allocate ideal_num_chunks, use + // half the free memory, rounded down + if (required_size > free_byte / 2) { + num_chunks = (free_byte / 2) / chunk_bytes; + } + // then take the largest power of 2 smaller than that + size_t po2_num_chunks = 1; + while (po2_num_chunks * 2 < num_chunks) { + po2_num_chunks *= 2; + } + return po2_num_chunks; + } +}; + +} // namespace Impl +} // namespace KokkosSparse +#include "KokkosSparse_spgemm_imp_outer.hpp" +#include "KokkosSparse_spgemm_impl_memaccess.hpp" +#include "KokkosSparse_spgemm_impl_kkmem.hpp" +#include "KokkosSparse_spgemm_impl_speed.hpp" +#include "KokkosSparse_spgemm_impl_compression.hpp" +#include "KokkosSparse_spgemm_impl_def.hpp" +#include "KokkosSparse_spgemm_impl_symbolic.hpp" +#include "KokkosSparse_spgemm_impl_triangle.hpp" +#endif diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp new file mode 100644 index 0000000000..173a58b568 --- /dev/null +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp @@ -0,0 +1,294 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { + +namespace Impl { + +template +template +void KokkosSPGEMM< + HandleType, a_row_view_t_, a_lno_nnz_view_t_, a_scalar_nnz_view_t_, + b_lno_row_view_t_, b_lno_nnz_view_t_, + b_scalar_nnz_view_t_>::KokkosSPGEMM_numeric(c_row_view_t &rowmapC_, + c_lno_nnz_view_t &entriesC_, + c_scalar_nnz_view_t &valuesC_) { + // get the algorithm and execution space. + // SPGEMMAlgorithm spgemm_algorithm = + // this->handle->get_spgemm_handle()->get_algorithm_type(); + KokkosKernels::Impl::ExecSpaceType my_exec_space_ = + KokkosKernels::Impl::get_exec_space_type(); + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "Numeric PHASE" << std::endl; + } + + if (spgemm_algorithm == SPGEMM_KK_SPEED || + spgemm_algorithm == SPGEMM_KK_DENSE) { + this->KokkosSPGEMM_numeric_speed(rowmapC_, entriesC_, valuesC_, + my_exec_space_); + } else { + this->KokkosSPGEMM_numeric_hash(rowmapC_, entriesC_, valuesC_, + my_exec_space_); + } +} + +template +template +void KokkosSPGEMM::KokkosSPGEMM_symbolic(c_row_view_t + rowmapC_) { + { + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "SYMBOLIC PHASE" << std::endl; + } + // first calculate the number of original flops required. + { + nnz_lno_t maxNumRoughZeros = 0; + size_t overall_flops = 0; + Kokkos::Timer timer1; + auto new_row_mapB_begin = + Kokkos::subview(row_mapB, std::make_pair(nnz_lno_t(0), b_row_cnt)); + auto new_row_mapB_end = Kokkos::subview( + row_mapB, std::make_pair(nnz_lno_t(1), b_row_cnt + 1)); + row_lno_persistent_work_view_t flops_per_row( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "original row flops"), + a_row_cnt); + + // get maximum row flops. + maxNumRoughZeros = this->getMaxRoughRowNNZ( + a_row_cnt, row_mapA, entriesA, new_row_mapB_begin, new_row_mapB_end, + flops_per_row.data()); + + // calculate overal flops. + KokkosKernels::Impl::kk_reduce_view2( + a_row_cnt, flops_per_row, overall_flops); + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\tOriginal Max Row Flops:" << maxNumRoughZeros + << std::endl; + std::cout << "\tOriginal overall_flops Flops:" << overall_flops + << std::endl; + std::cout << "\ttOriginal Max Row Flop Calc Time:" << timer1.seconds() + << std::endl; + } + this->handle->get_spgemm_handle()->original_max_row_flops = + maxNumRoughZeros; + this->handle->get_spgemm_handle()->original_overall_flops = overall_flops; + this->handle->get_spgemm_handle()->row_flops = flops_per_row; + } + + // number of rows and nnzs + nnz_lno_t n = this->row_mapB.extent(0) - 1; + size_type nnz = this->entriesB.extent(0); + + bool compress_in_single_step = + this->handle->get_spgemm_handle()->get_compression_step(); + // compress in single step if it is GPU. + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) + compress_in_single_step = true; + + // compressed B fields. + row_lno_temp_work_view_t new_row_mapB( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "new row map"), n + 1); + row_lno_temp_work_view_t new_row_mapB_begins; + + nnz_lno_temp_work_view_t + set_index_entries; // will be output of compress matrix. + nnz_lno_temp_work_view_t set_entries; // will be output of compress matrix + + // First Compress B. + Kokkos::Timer timer1; + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\tCOMPRESS MATRIX-B PHASE" << std::endl; + } + + // call compression. + // it might not go through to the end if ratio is not high. + bool compression_applied = this->compressMatrix( + n, nnz, this->row_mapB, this->entriesB, new_row_mapB, set_index_entries, + set_entries, compress_in_single_step); + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tCOMPRESS MATRIX-B overall time:" << timer1.seconds() + << std::endl + << std::endl; + } + + timer1.reset(); + + // first get the max flops for a row, which will be used for max row size. + // If we did compression in single step, row_mapB[i] points the begining of + // row i, and new_row_mapB[i] points to the end of row i. + + if (compression_applied) { + nnz_lno_t maxNumRoughZeros = + this->handle->get_spgemm_handle()->compressed_max_row_flops; + + if (compress_in_single_step) { + // calling symbolic structure + this->symbolic_c(a_row_cnt, row_mapA, entriesA, row_mapB, new_row_mapB, + set_index_entries, set_entries, rowmapC_, + maxNumRoughZeros); + + } else { + nnz_lno_t begin = 0; + auto new_row_mapB_begin = + Kokkos::subview(new_row_mapB, std::make_pair(begin, n)); + auto new_row_mapB_end = + Kokkos::subview(new_row_mapB, std::make_pair(begin + 1, n + 1)); + + // calling symbolic structure + this->symbolic_c(a_row_cnt, row_mapA, entriesA, new_row_mapB_begin, + new_row_mapB_end, set_index_entries, set_entries, + rowmapC_, maxNumRoughZeros); + } + } else { + new_row_mapB = row_lno_temp_work_view_t(); + new_row_mapB_begins = row_lno_temp_work_view_t(); + set_index_entries = nnz_lno_temp_work_view_t(); + set_entries = nnz_lno_temp_work_view_t(); + nnz_lno_t maxNumRoughZeros = + this->handle->get_spgemm_handle()->original_max_row_flops; + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "SYMBOLIC PHASE -- NO COMPRESSION: maxNumRoughZeros:" + << maxNumRoughZeros << std::endl; + } + + auto new_row_mapB_begin = + Kokkos::subview(this->row_mapB, std::make_pair(nnz_lno_t(0), n)); + auto new_row_mapB_end = + Kokkos::subview(this->row_mapB, std::make_pair(nnz_lno_t(1), n + 1)); + + // calling symbolic structure + this->symbolic_c_no_compression( + a_row_cnt, row_mapA, entriesA, new_row_mapB_begin, new_row_mapB_end, + this->entriesB, rowmapC_, maxNumRoughZeros); + } +#ifdef KOKKOSKERNELS_ANALYZE_MEMORYACCESS + double read_write_cost = + this->handle->get_spgemm_handle()->get_read_write_cost_calc(); + if (read_write_cost) { + this->print_read_write_cost(rowmapC_); + } +#endif + } +} + +template +template +void KokkosSPGEMM:: + write_matrix_to_plot(nnz_lno_t &num_colors, + nnz_lno_persistent_work_host_view_t &h_color_xadj, + nnz_lno_persistent_work_view_t &color_adj, + c_row_view_t &rowmapC, c_nnz_view_t &entryIndicesC_) { + std::cout << "writing to plot" << std::endl; + + nnz_lno_persistent_work_host_view_t h_color_adj = + Kokkos::create_mirror_view(color_adj); + Kokkos::deep_copy(h_color_adj, color_adj); + auto h_rowmapC = Kokkos::create_mirror_view(rowmapC); + Kokkos::deep_copy(h_rowmapC, rowmapC); + auto h_entryIndicesC = Kokkos::create_mirror_view(entryIndicesC_); + Kokkos::deep_copy(h_entryIndicesC, entryIndicesC_); + + for (nnz_lno_t i = 0; i < num_colors; ++i) { + nnz_lno_t color_begin = h_color_xadj(i); + nnz_lno_t color_end = h_color_xadj(i + 1); + + std::string colorind = ""; + std::stringstream ss; + ss << i; + + ss >> colorind; + colorind += ".coords"; + std::fstream fs; + fs.open(colorind.c_str(), std::fstream::out); + + std::cout << "COLOR:" << i << " colorbegin:" << color_begin + << " colorend:" << color_end + << " size:" << color_end - color_begin << std::endl; + for (nnz_lno_t j = color_begin; j < color_end; ++j) { + nnz_lno_t row = h_color_adj(j); + for (size_type k = h_rowmapC(row); k < h_rowmapC(row + 1); ++k) { + nnz_lno_t column = h_entryIndicesC(k); + // std::cout << row << " " << column << std::endl; + fs << row << " " << column << std::endl; + } + } + fs.close(); + } + + std::fstream fs; + fs.open("plot1.gnuplot", std::fstream::out); + for (nnz_lno_t i = 0; i < num_colors; ++i) { + std::string colorind = "\""; + std::stringstream ss; + ss << i; + + ss >> colorind; + colorind += ".coords\""; + if (i > 0) fs << "re"; + fs << "plot " << colorind << std::endl; + } + fs << "pause -1" << std::endl; + fs.close(); +} + +} // namespace Impl +} // namespace KokkosSparse diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp new file mode 100644 index 0000000000..94cec7af04 --- /dev/null +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp @@ -0,0 +1,1855 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#define HASHSCALAR 107 + +#include "KokkosKernels_Utils.hpp" + +namespace KokkosSparse { + +namespace Impl { + +template +template +struct KokkosSPGEMM::PortableNumericCHASH { + nnz_lno_t numrows; + + a_row_view_t row_mapA; + a_nnz_view_t entriesA; + a_scalar_view_t valuesA; + + b_row_view_t row_mapB; + b_nnz_view_t entriesB; + b_scalar_view_t valuesB; + + c_row_view_t rowmapC; + c_nnz_view_t entriesC; + c_scalar_view_t valuesC; + + nnz_lno_t *pEntriesC; + scalar_t *pvaluesC; + const size_t shared_memory_size; + const int vector_size; + pool_memory_type memory_space; + + // nnz_lno_t max_nnz; + const nnz_lno_t pow2_hash_size; + const nnz_lno_t max_nnz; + const nnz_lno_t pow2_hash_func; + const KokkosKernels::Impl::ExecSpaceType my_exec_space; + const nnz_lno_t team_work_size; + + const int unit_memory; // begins, nexts, and keys. No need for vals yet. + const int suggested_team_size; + const int thread_memory; + nnz_lno_t thread_shmem_key_size; + nnz_lno_t thread_shared_memory_hash_func; + nnz_lno_t thread_shmem_hash_size; + + nnz_lno_t team_shmem_key_size; + nnz_lno_t team_shared_memory_hash_func; + nnz_lno_t team_shmem_hash_size; + + nnz_lno_t team_cuckoo_key_size, team_cuckoo_hash_func; + + nnz_lno_t max_first_level_hash_size; + row_lno_persistent_work_view_t flops_per_row; + + PortableNumericCHASH( + nnz_lno_t m_, a_row_view_t row_mapA_, a_nnz_view_t entriesA_, + a_scalar_view_t valuesA_, + + b_row_view_t row_mapB_, b_nnz_view_t entriesB_, b_scalar_view_t valuesB_, + + c_row_view_t rowmapC_, c_nnz_view_t entriesC_, c_scalar_view_t valuesC_, + size_t shared_memory_size_, int vector_size_, pool_memory_type mpool_, + nnz_lno_t min_hash_size, nnz_lno_t max_nnz_, int suggested_team_size_, + const KokkosKernels::Impl::ExecSpaceType my_exec_space_, + nnz_lno_t team_row_chunk_size, double first_level_cut_off, + row_lno_persistent_work_view_t flops_per_row_, + bool KOKKOSKERNELS_VERBOSE_) + : numrows(m_), + row_mapA(row_mapA_), + entriesA(entriesA_), + valuesA(valuesA_), + + row_mapB(row_mapB_), + entriesB(entriesB_), + valuesB(valuesB_), + + rowmapC(rowmapC_), + entriesC(entriesC_), + valuesC(valuesC_), + pEntriesC(entriesC_.data()), + pvaluesC(valuesC_.data()), + shared_memory_size(shared_memory_size_ / 8 * 8), + vector_size(vector_size_), + memory_space(mpool_), + // max_nnz(), + pow2_hash_size(min_hash_size), + max_nnz(max_nnz_), + pow2_hash_func(min_hash_size - 1), + my_exec_space(my_exec_space_), + team_work_size(team_row_chunk_size), + + unit_memory(sizeof(nnz_lno_t) * 2 + sizeof(nnz_lno_t) + + sizeof(scalar_t)), + suggested_team_size(suggested_team_size_), + thread_memory((shared_memory_size / 8 / suggested_team_size_) * 8), + thread_shmem_key_size(), + thread_shared_memory_hash_func(), + thread_shmem_hash_size(1), + team_shmem_key_size(), + team_shared_memory_hash_func(), + team_shmem_hash_size(1), + team_cuckoo_key_size(1), + team_cuckoo_hash_func(1), + max_first_level_hash_size(1), + flops_per_row(flops_per_row_) + + { + nnz_lno_t tmp_team_cuckoo_key_size = + ((shared_memory_size - sizeof(nnz_lno_t) * 2) / + (sizeof(nnz_lno_t) + sizeof(scalar_t))); + + while (team_cuckoo_key_size * 2 < tmp_team_cuckoo_key_size) + team_cuckoo_key_size = team_cuckoo_key_size * 2; + team_cuckoo_hash_func = team_cuckoo_key_size - 1; + // How many extra bytes are needed to align a scalar_t after an array of + // nnz_lno_t, in the worst case? + constexpr size_t scalarAlignPad = + (alignof(scalar_t) > alignof(nnz_lno_t)) + ? (alignof(scalar_t) - alignof(nnz_lno_t)) + : 0; + team_shmem_key_size = + ((shared_memory_size - sizeof(nnz_lno_t) * 4 - scalarAlignPad) / + unit_memory); + thread_shmem_key_size = + ((thread_memory - sizeof(nnz_lno_t) * 4 - scalarAlignPad) / + unit_memory); + if (KOKKOSKERNELS_VERBOSE_) { + std::cout << "\t\tPortableNumericCHASH -- sizeof(scalar_t): " + << sizeof(scalar_t) + << " sizeof(nnz_lno_t): " << sizeof(nnz_lno_t) + << " suggested_team_size: " << suggested_team_size + << std::endl; + std::cout << "\t\tPortableNumericCHASH -- thread_memory:" << thread_memory + << " unit_memory:" << unit_memory + << " initial key size:" << thread_shmem_key_size << std::endl; + std::cout << "\t\tPortableNumericCHASH -- team shared_memory:" + << shared_memory_size << " unit_memory:" << unit_memory + << " initial team key size:" << team_shmem_key_size + << std::endl; + } + while (thread_shmem_hash_size * 2 <= thread_shmem_key_size) { + thread_shmem_hash_size = thread_shmem_hash_size * 2; + } + while (team_shmem_hash_size * 2 <= team_shmem_key_size) { + team_shmem_hash_size = team_shmem_hash_size * 2; + } + team_shared_memory_hash_func = team_shmem_hash_size - 1; + thread_shared_memory_hash_func = thread_shmem_hash_size - 1; + team_shmem_key_size = + team_shmem_key_size + + ((team_shmem_key_size - team_shmem_hash_size) * sizeof(nnz_lno_t)) / + (sizeof(nnz_lno_t) * 2 + sizeof(scalar_t)); + team_shmem_key_size = (team_shmem_key_size >> 1) << 1; + + thread_shmem_key_size = + thread_shmem_key_size + + ((thread_shmem_key_size - thread_shmem_hash_size) * sizeof(nnz_lno_t)) / + (sizeof(nnz_lno_t) * 2 + sizeof(scalar_t)); + thread_shmem_key_size = (thread_shmem_key_size >> 1) << 1; + + if (KOKKOSKERNELS_VERBOSE_) { + std::cout << "\t\tPortableNumericCHASH -- thread_memory:" << thread_memory + << " unit_memory:" << unit_memory + << " resized key size:" << thread_shmem_key_size << std::endl; + std::cout << "\t\tPortableNumericCHASH -- team shared_memory:" + << shared_memory_size << " unit_memory:" << unit_memory + << " resized team key size:" << team_shmem_key_size + << std::endl; + } + + max_first_level_hash_size = first_level_cut_off * team_cuckoo_key_size; + if (KOKKOSKERNELS_VERBOSE_) { + std::cout << "\t\tPortableNumericCHASH -- thread_memory:" << thread_memory + << " unit_memory:" << unit_memory + << " initial key size:" << thread_shmem_key_size << std::endl; + std::cout << "\t\tPortableNumericCHASH -- team_memory:" + << shared_memory_size << " unit_memory:" << unit_memory + << " initial team key size:" << team_shmem_key_size + << std::endl; + std::cout << "\t\tPortableNumericCHASH -- adjusted hashsize:" + << thread_shmem_hash_size + << " thread_shmem_key_size:" << thread_shmem_key_size + << std::endl; + std::cout << "\t\tPortableNumericCHASH -- adjusted team hashsize:" + << team_shmem_hash_size + << " team_shmem_key_size:" << team_shmem_key_size << std::endl; + std::cout << "\t\t team_cuckoo_key_size:" << team_cuckoo_key_size + << " team_cuckoo_hash_func:" << team_cuckoo_hash_func + << " max_first_level_hash_size:" << max_first_level_hash_size + << std::endl; + std::cout << "\t\t pow2_hash_size:" << pow2_hash_size + << " pow2_hash_func:" << pow2_hash_func << std::endl; + } + } + + KOKKOS_INLINE_FUNCTION + size_t get_thread_id(const size_t row_index) const { + switch (my_exec_space) { + default: return row_index; +#if defined(KOKKOS_ENABLE_SERIAL) + case KokkosKernels::Impl::Exec_SERIAL: return 0; +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + case KokkosKernels::Impl::Exec_OMP: + return Kokkos::OpenMP::impl_hardware_thread_id(); +#endif +#if defined(KOKKOS_ENABLE_THREADS) + case KokkosKernels::Impl::Exec_THREADS: + return Kokkos::Threads::impl_hardware_thread_id(); +#endif +#if defined(KOKKOS_ENABLE_CUDA) + case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined(KOKKOS_ENABLE_HIP) + case KokkosKernels::Impl::Exec_HIP: return row_index; +#endif + } + } + + // linear probing with tracking. + KOKKOS_INLINE_FUNCTION + void operator()(const MultiCoreTag4 &, + const team_member_t &teamMember) const { + const nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size; + const nnz_lno_t team_row_end = + KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_work_size, numrows); + + volatile nnz_lno_t *tmp = NULL; + size_t tid = get_thread_id(team_row_begin + teamMember.team_rank()); + while (tmp == NULL) { + tmp = (volatile nnz_lno_t *)(memory_space.allocate_chunk(tid)); + } + + nnz_lno_t *used_indices = (nnz_lno_t *)(tmp); + tmp += max_nnz; + nnz_lno_t *hash_ids = (nnz_lno_t *)(tmp); + tmp += pow2_hash_size; + + scalar_t *hash_values = + KokkosKernels::Impl::alignPtr(tmp); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), + [&](const nnz_lno_t &row_index) { + nnz_lno_t used_count = 0; + + const size_type col_begin = row_mapA[row_index]; + const nnz_lno_t left_work = row_mapA[row_index + 1] - col_begin; + for (nnz_lno_t ii = 0; ii < left_work; ++ii) { + size_type a_col = col_begin + ii; + nnz_lno_t rowB = entriesA[a_col]; + scalar_t valA = valuesA[a_col]; + + size_type rowBegin = row_mapB(rowB); + nnz_lno_t left_workB = row_mapB(rowB + 1) - rowBegin; + + for (nnz_lno_t i = 0; i < left_workB; ++i) { + const size_type adjind = i + rowBegin; + nnz_lno_t b_col_ind = entriesB[adjind]; + scalar_t b_val = valuesB[adjind] * valA; + nnz_lno_t hash = (b_col_ind * HASHSCALAR) & pow2_hash_func; + + while (true) { + if (hash_ids[hash] == -1) { + used_indices[used_count++] = hash; + hash_ids[hash] = b_col_ind; + hash_values[hash] = b_val; + break; + } else if (hash_ids[hash] == b_col_ind) { + hash_values[hash] += b_val; + break; + } else { + hash = (hash + 1) & pow2_hash_func; + } + } + } + } + size_type c_row_begin = rowmapC[row_index]; + for (nnz_lno_t i = 0; i < used_count; ++i) { + nnz_lno_t used_index = used_indices[i]; + pEntriesC[c_row_begin] = hash_ids[used_index]; + pvaluesC[c_row_begin++] = hash_values[used_index]; + hash_ids[used_index] = -1; + } + }); + memory_space.release_chunk(used_indices); + } + + // assumes that the vector lane is 1, as in cpus + KOKKOS_INLINE_FUNCTION + void operator()(const MultiCoreTag &, const team_member_t &teamMember) const { + const nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size; + const nnz_lno_t team_row_end = + KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_work_size, numrows); + + KokkosKernels::Experimental::HashmapAccumulator< + nnz_lno_t, nnz_lno_t, scalar_t, + KokkosKernels::Experimental::HashOpType::bitwiseAnd> + hm2(pow2_hash_size, pow2_hash_func, NULL, NULL, NULL, NULL); + + volatile nnz_lno_t *tmp = NULL; + size_t tid = get_thread_id(team_row_begin + teamMember.team_rank()); + while (tmp == NULL) { + tmp = (volatile nnz_lno_t *)(memory_space.allocate_chunk(tid)); + } + nnz_lno_t *globally_used_hash_indices = (nnz_lno_t *)tmp; + tmp += pow2_hash_size; + + hm2.hash_begins = (nnz_lno_t *)(tmp); + tmp += pow2_hash_size; + hm2.hash_nexts = (nnz_lno_t *)(tmp); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), + [&](const nnz_lno_t &row_index) { + nnz_lno_t globally_used_hash_count = 0; + nnz_lno_t used_hash_sizes = 0; + + const size_type c_row_begin = rowmapC[row_index]; + + hm2.keys = pEntriesC + c_row_begin; + hm2.values = pvaluesC + c_row_begin; + + const size_type col_begin = row_mapA[row_index]; + const nnz_lno_t left_work = row_mapA[row_index + 1] - col_begin; + + for (nnz_lno_t ii = 0; ii < left_work; ++ii) { + size_type a_col = col_begin + ii; + nnz_lno_t rowB = entriesA[a_col]; + scalar_t valA = valuesA[a_col]; + + size_type rowBegin = row_mapB(rowB); + nnz_lno_t left_workB = row_mapB(rowB + 1) - rowBegin; + + for (nnz_lno_t i = 0; i < left_workB; ++i) { + const size_type adjind = i + rowBegin; + nnz_lno_t b_col_ind = entriesB[adjind]; + scalar_t b_val = valuesB[adjind] * valA; + // nnz_lno_t hash = (b_col_ind * 107) & pow2_hash_func; + + // this has to be a success, we do not need to check for the + // success. int insertion = + hm2.sequential_insert_into_hash_mergeAdd_TrackHashes( + b_col_ind, b_val, &used_hash_sizes, &globally_used_hash_count, + globally_used_hash_indices); + } + } + for (nnz_lno_t i = 0; i < globally_used_hash_count; ++i) { + nnz_lno_t dirty_hash = globally_used_hash_indices[i]; + hm2.hash_begins[dirty_hash] = -1; + } + }); + memory_space.release_chunk(globally_used_hash_indices); + } + + // assumes that the vector lane is 1, as in cpus + KOKKOS_INLINE_FUNCTION + void operator()(const MultiCoreTag2 &, + const team_member_t &teamMember) const { + const nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size; + const nnz_lno_t team_row_end = + KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_work_size, numrows); + volatile nnz_lno_t *tmp = NULL; + size_t tid = get_thread_id(team_row_begin + teamMember.team_rank()); + nnz_lno_t chunk_size = 0; + + while (tmp == NULL) { + tmp = (volatile nnz_lno_t *)(memory_space.allocate_chunk(tid)); + // issue-508, TODO: chunk_size = ??? + } + nnz_lno_t *globally_used_hash_indices = (nnz_lno_t *)tmp; + + KokkosKernels::Experimental::HashmapAccumulator< + nnz_lno_t, nnz_lno_t, scalar_t, + KokkosKernels::Experimental::HashOpType::bitwiseAnd> + hm2(chunk_size, pow2_hash_func, NULL, NULL, NULL, NULL); + + tmp += pow2_hash_size; + + hm2.hash_begins = (nnz_lno_t *)(tmp); + tmp += pow2_hash_size; + hm2.hash_nexts = (nnz_lno_t *)(tmp); + tmp += max_nnz; + + hm2.keys = (nnz_lno_t *)(tmp); + tmp += max_nnz; + hm2.values = + KokkosKernels::Impl::alignPtr(tmp); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), + [&](const nnz_lno_t &row_index) { + nnz_lno_t globally_used_hash_count = 0; + nnz_lno_t used_hash_sizes = 0; + + const size_type c_row_begin = rowmapC[row_index]; + const size_type c_row_end = rowmapC[row_index + 1]; + + const nnz_lno_t global_memory_hash_size = + nnz_lno_t(c_row_end - c_row_begin); + + const size_type col_begin = row_mapA[row_index]; + const nnz_lno_t left_work = row_mapA[row_index + 1] - col_begin; + for (nnz_lno_t ii = 0; ii < left_work; ++ii) { + size_type a_col = col_begin + ii; + nnz_lno_t rowB = entriesA[a_col]; + scalar_t valA = valuesA[a_col]; + + size_type rowBegin = row_mapB(rowB); + nnz_lno_t left_workB = row_mapB(rowB + 1) - rowBegin; + + for (nnz_lno_t i = 0; i < left_workB; ++i) { + const size_type adjind = i + rowBegin; + nnz_lno_t b_col_ind = entriesB[adjind]; + scalar_t b_val = valuesB[adjind] * valA; + nnz_lno_t hash = b_col_ind & pow2_hash_func; + + // this has to be a success, we do not need to check for the + // success. int insertion = + hm2.sequential_insert_into_hash_mergeAdd_TrackHashes( + b_col_ind, b_val, &used_hash_sizes, &globally_used_hash_count, + globally_used_hash_indices); + } + } + for (nnz_lno_t i = 0; i < globally_used_hash_count; ++i) { + nnz_lno_t dirty_hash = globally_used_hash_indices[i]; + hm2.hash_begins[dirty_hash] = -1; + } + for (nnz_lno_t i = 0; i < global_memory_hash_size; ++i) { + pEntriesC[c_row_begin + i] = hm2.keys[i]; + pvaluesC[c_row_begin + i] = hm2.values[i]; + } + }); + memory_space.release_chunk(globally_used_hash_indices); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const GPUTag &, const team_member_t &teamMember) const { + nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size; + const nnz_lno_t team_row_end = + KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_work_size, numrows); + + // int thread_memory = (shared_memory_size / 8 / teamMember.team_size()) * + // 8; + char *all_shared_memory = + (char *)(teamMember.team_shmem().get_shmem(shared_memory_size)); + + // shift it to the thread private part + all_shared_memory += thread_memory * teamMember.team_rank(); + + // used_hash_sizes hold the size of 1st and 2nd level hashes + volatile nnz_lno_t *used_hash_sizes = + (volatile nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * 2; + + nnz_lno_t *globally_used_hash_count = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * 2; + + // int unit_memory = sizeof(nnz_lno_t) * 2 + sizeof(nnz_lno_t) + sizeof + // (scalar_t) ; //begins, nexts, keys and vals . nnz_lno_t shmem_key_size = + // (thread_memory - sizeof(nnz_lno_t) * 4) / unit_memory; if (shmem_key_size + // & 1) shmem_key_size -= 1; + + nnz_lno_t *begins = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * thread_shmem_hash_size; + + // points to the next elements + nnz_lno_t *nexts = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * thread_shmem_key_size; + + // holds the keys + nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * thread_shmem_key_size; + // remainder of shmem allocation for vals + scalar_t *vals = + KokkosKernels::Impl::alignPtr(all_shared_memory); + + KokkosKernels::Experimental::HashmapAccumulator< + nnz_lno_t, nnz_lno_t, scalar_t, + KokkosKernels::Experimental::HashOpType::bitwiseAnd> + hm(thread_shmem_key_size, thread_shared_memory_hash_func, begins, nexts, + keys, vals); + + KokkosKernels::Experimental::HashmapAccumulator< + nnz_lno_t, nnz_lno_t, scalar_t, + KokkosKernels::Experimental::HashOpType::bitwiseAnd> + hm2(pow2_hash_size, pow2_hash_func, NULL, NULL, NULL, NULL); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), + [&](const nnz_lno_t &row_index) { + const size_type c_row_begin = rowmapC[row_index]; + const size_type c_row_end = rowmapC[row_index + 1]; + const nnz_lno_t global_memory_hash_size = + nnz_lno_t(c_row_end - c_row_begin); + + bool is_global_alloced = false; + nnz_lno_t *globally_used_hash_indices = NULL; + + if (global_memory_hash_size > thread_shmem_key_size) { + volatile nnz_lno_t *tmp = NULL; + // size_t tid = get_thread_id(row_index); + // the code gets internal compiler error on gcc 4.7.2 + // assuming that this part only runs on GPUs for now, below fix + // has the exact same behaviour and runs okay. + size_t tid = row_index; + + while (tmp == NULL) { + Kokkos::single( + Kokkos::PerThread(teamMember), + [&](volatile nnz_lno_t *&memptr) { + memptr = (volatile nnz_lno_t *)(memory_space.allocate_chunk( + tid)); + }, + tmp); + } + + is_global_alloced = true; + globally_used_hash_indices = (nnz_lno_t *)tmp; + tmp += pow2_hash_size; + hm2.hash_begins = (nnz_lno_t *)(tmp); + tmp += pow2_hash_size; + hm2.hash_nexts = (nnz_lno_t *)(tmp); + } + hm2.keys = pEntriesC + c_row_begin; + hm2.values = pvaluesC + c_row_begin; + + // initialize begins. + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, thread_shmem_hash_size), + [&](nnz_lno_t i) { begins[i] = -1; }); + + // initialize hash usage sizes + Kokkos::single(Kokkos::PerThread(teamMember), [&]() { + used_hash_sizes[0] = 0; + used_hash_sizes[1] = 0; + globally_used_hash_count[0] = 0; + }); + + const size_type col_begin = row_mapA[row_index]; + const nnz_lno_t left_work = row_mapA[row_index + 1] - col_begin; + nnz_lno_t ii = left_work; + // for ( nnz_lno_t ii = 0; ii < left_work; ++ii){ + while (ii-- > 0) { + size_type a_col = col_begin + ii; + nnz_lno_t rowB = entriesA[a_col]; + scalar_t valA = valuesA[a_col]; + + size_type rowBegin = row_mapB(rowB); + nnz_lno_t left_work_ = row_mapB(rowB + 1) - rowBegin; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, left_work_), + [&](nnz_lno_t i) { + const size_type adjind = i + rowBegin; + nnz_lno_t b_col_ind = entriesB[adjind]; + scalar_t b_val = valuesB[adjind] * valA; + volatile int num_unsuccess = + hm.vector_atomic_insert_into_hash_mergeAdd( + b_col_ind, b_val, used_hash_sizes); + if (num_unsuccess) { + hm2.vector_atomic_insert_into_hash_mergeAdd_TrackHashes( + b_col_ind, b_val, used_hash_sizes + 1, + globally_used_hash_count, globally_used_hash_indices); + } + }); + } + + if (is_global_alloced) { + nnz_lno_t dirty_hashes = globally_used_hash_count[0]; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, dirty_hashes), + [&](nnz_lno_t i) { + nnz_lno_t dirty_hash = globally_used_hash_indices[i]; + hm2.hash_begins[dirty_hash] = -1; + }); + + Kokkos::single(Kokkos::PerThread(teamMember), [&]() { + memory_space.release_chunk(globally_used_hash_indices); + }); + } + + Kokkos::single(Kokkos::PerThread(teamMember), [&]() { + if (used_hash_sizes[0] > thread_shmem_key_size) + used_hash_sizes[0] = thread_shmem_key_size; + }); + + nnz_lno_t num_elements = used_hash_sizes[0]; + + nnz_lno_t written_index = used_hash_sizes[1]; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, num_elements), + [&](nnz_lno_t i) { + pEntriesC[c_row_begin + written_index + i] = keys[i]; + pvaluesC[c_row_begin + written_index + i] = vals[i]; + }); + }); + } + + // one row does not fit into shmem, with thread-flat-parallel + KOKKOS_INLINE_FUNCTION + void operator()(const GPUTag6 &, const team_member_t &teamMember) const { + nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size; + const nnz_lno_t team_row_end = + KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_work_size, numrows); + char *all_shared_memory = + (char *)(teamMember.team_shmem().get_shmem(shared_memory_size)); + + // shmem == sizeof(nnz_lno_t)*2 + sizeof(nnz_lno_t)*team_cuckoo_key_size + + // sizeof(scalar_t)*nvals + const nnz_lno_t init_value = -1; + volatile nnz_lno_t *used_hash_sizes = + (volatile nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * 2; + // holds the keys + nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * team_cuckoo_key_size; + scalar_t *vals = + KokkosKernels::Impl::alignPtr(all_shared_memory); + + int thread_rank = teamMember.team_rank(); + + int vector_rank = 0; + typedef typename std::remove_reference::type + atomic_incr_type; + Kokkos::parallel_scan( + Kokkos::ThreadVectorRange(teamMember, vector_size), + [&](const int /* threadid */, int &update, const bool final) { + if (final) { + vector_rank = update; + } + update += 1; + }); + int bs = vector_size * suggested_team_size; + int vector_shift = thread_rank * vector_size + vector_rank; + + for (nnz_lno_t row_index = team_row_begin; row_index < team_row_end; + ++row_index) { + if (row_mapA[row_index] == row_mapA[row_index + 1]) // skip empty A rows + continue; +#if 1 + teamMember.team_barrier(); +#endif + const size_type c_row_begin = rowmapC[row_index]; + const size_type c_row_end = rowmapC[row_index + 1]; + const nnz_lno_t c_row_size = c_row_end - c_row_begin; + nnz_lno_t *c_row = entriesC.data() + c_row_begin; + scalar_t *c_row_vals = valuesC.data() + c_row_begin; + nnz_lno_t *global_acc_row_keys = c_row; + scalar_t *global_acc_row_vals = c_row_vals; + volatile nnz_lno_t *tmp = NULL; + + if (c_row_size > max_first_level_hash_size) { + { + while (tmp == NULL) { + Kokkos::single( + Kokkos::PerTeam(teamMember), + [&](volatile nnz_lno_t *&memptr) { + memptr = (volatile nnz_lno_t *)(memory_space.allocate_chunk( + row_index)); + }, + tmp); + } + global_acc_row_keys = (nnz_lno_t *)(tmp); + global_acc_row_vals = + KokkosKernels::Impl::alignPtr( + tmp + pow2_hash_size); + } + // initialize begins. + { + nnz_lno_t num_threads = pow2_hash_size / vector_size; + // not needed as team_cuckoo_key_size is always pow2. + + // (team_cuckoo_key_size & (vector_size - 1)) * 1; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, num_threads), + [&](nnz_lno_t teamind) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, vector_size), + [&](nnz_lno_t i) { + global_acc_row_vals[teamind * vector_size + i] = 0; + }); + }); + } + } + + // initialize begins. + { + nnz_lno_t num_threads = team_cuckoo_key_size / vector_size; + // not needed as team_cuckoo_key_size is always pow2. + + // (team_cuckoo_key_size & (vector_size - 1)) * 1; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, num_threads), + [&](nnz_lno_t teamind) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, vector_size), + [&](nnz_lno_t i) { + keys[teamind * vector_size + i] = init_value; + vals[teamind * vector_size + i] = 0; + }); + }); + } + + // initialize hash usage sizes + Kokkos::single(Kokkos::PerTeam(teamMember), [&]() { + used_hash_sizes[0] = 0; + used_hash_sizes[1] = 0; + }); + + bool insert_is_on = true; + const size_type a_col_begin_offset = row_mapA[row_index]; + + nnz_lno_t a_col_ind = entriesA[a_col_begin_offset]; + scalar_t a_col_val = valuesA[a_col_begin_offset]; + + nnz_lno_t current_a_column_offset_inrow = 0; + nnz_lno_t flops_on_the_left_of_offsett = 0; + size_type current_b_read_offsett = row_mapB[a_col_ind]; + nnz_lno_t current_a_column_flops = + row_mapB[a_col_ind + 1] - current_b_read_offsett; + + nnz_lno_t row_flops = flops_per_row(row_index); + +#if 1 + teamMember.team_barrier(); +#endif + for (nnz_lno_t vector_read_shift = vector_shift; + vector_read_shift < row_flops; vector_read_shift += bs) { + { + nnz_lno_t my_b_col_shift = + vector_read_shift - flops_on_the_left_of_offsett; + nnz_lno_t my_b_col = init_value; + scalar_t my_b_val = 0; + nnz_lno_t hash = init_value; + int fail = 0; + + if (my_b_col_shift >= current_a_column_flops) { + do { + ++current_a_column_offset_inrow; + my_b_col_shift -= current_a_column_flops; + flops_on_the_left_of_offsett += current_a_column_flops; + a_col_ind = + entriesA[a_col_begin_offset + current_a_column_offset_inrow]; + + current_b_read_offsett = row_mapB[a_col_ind]; + current_a_column_flops = + row_mapB[a_col_ind + 1] - current_b_read_offsett; + } while (my_b_col_shift >= current_a_column_flops); + a_col_val = + valuesA[a_col_begin_offset + current_a_column_offset_inrow]; + } + + my_b_col = entriesB[my_b_col_shift + current_b_read_offsett]; + my_b_val = + valuesB[my_b_col_shift + current_b_read_offsett] * a_col_val; + // now insert it to first level hashmap accumulator. + hash = (my_b_col * HASHSCALAR) & team_cuckoo_hash_func; + fail = 1; + bool try_to_insert = true; + + // nnz_lno_t max_tries = team_cuckoo_key_size; + nnz_lno_t search_end = + team_cuckoo_key_size; // KOKKOSKERNELS_MACRO_MIN(team_cuckoo_key_size, + // hash + max_tries); + for (nnz_lno_t trial = hash; trial < search_end;) { + if (keys[trial] == my_b_col) { + Kokkos::atomic_add(vals + trial, my_b_val); + fail = 0; + break; + } else if (keys[trial] == init_value) { + if (!insert_is_on) { + try_to_insert = false; + break; + } else if (Kokkos::atomic_compare_exchange_strong( + keys + trial, init_value, my_b_col)) { + Kokkos::atomic_add(vals + trial, my_b_val); + Kokkos::atomic_increment(used_hash_sizes); + if (used_hash_sizes[0] > max_first_level_hash_size) + insert_is_on = false; + fail = 0; + break; + } + } else { + ++trial; + } + } + if (fail) { + search_end = hash; // max_tries - (team_cuckoo_key_size - hash); + + for (nnz_lno_t trial = 0; try_to_insert && trial < search_end;) { + if (keys[trial] == my_b_col) { + Kokkos::atomic_add(vals + trial, my_b_val); + fail = 0; + break; + } else if (keys[trial] == init_value) { + if (!insert_is_on) { + break; + } else if (Kokkos::atomic_compare_exchange_strong( + keys + trial, init_value, my_b_col)) { + Kokkos::atomic_add(vals + trial, my_b_val); + Kokkos::atomic_increment(used_hash_sizes); + if (used_hash_sizes[0] > max_first_level_hash_size) + insert_is_on = false; + fail = 0; + break; + } + } else { + ++trial; + } + } + + if (fail) { + nnz_lno_t new_hash = (my_b_col * HASHSCALAR) & pow2_hash_func; + + for (nnz_lno_t trial = new_hash; trial < pow2_hash_size;) { + if (global_acc_row_keys[trial] == my_b_col) { + Kokkos::atomic_add(global_acc_row_vals + trial, my_b_val); + + // c_row_vals[trial] += my_b_val; + fail = 0; + break; + } else if (global_acc_row_keys[trial] == init_value) { + if (Kokkos::atomic_compare_exchange_strong( + global_acc_row_keys + trial, init_value, my_b_col)) { + Kokkos::atomic_add(global_acc_row_vals + trial, my_b_val); + // Kokkos::atomic_increment(used_hash_sizes + 1); + // c_row_vals[trial] = my_b_val; + fail = 0; + break; + } + } else { + ++trial; + } + } + if (fail) { + for (nnz_lno_t trial = 0; trial < new_hash;) { + if (global_acc_row_keys[trial] == my_b_col) { + // c_row_vals[trial] += my_b_val; + Kokkos::atomic_add(global_acc_row_vals + trial, my_b_val); + + break; + } else if (global_acc_row_keys[trial] == init_value) { + if (Kokkos::atomic_compare_exchange_strong( + global_acc_row_keys + trial, init_value, + my_b_col)) { + // Kokkos::atomic_increment(used_hash_sizes + 1); + Kokkos::atomic_add(global_acc_row_vals + trial, my_b_val); + // c_row_vals[trial] = my_b_val; + break; + } + } else { + ++trial; + } + } + } + } + } + } + } + + teamMember.team_barrier(); + + if (tmp != NULL) { + for (nnz_lno_t my_index = vector_shift; my_index < pow2_hash_size; + my_index += bs) { + nnz_lno_t my_b_col = global_acc_row_keys[my_index]; + if (my_b_col != init_value) { + scalar_t my_b_val = global_acc_row_vals[my_index]; + int fail = 1; + { + nnz_lno_t hash = (my_b_col * HASHSCALAR) & team_cuckoo_hash_func; + + // nnz_lno_t max_tries = team_cuckoo_key_size; + nnz_lno_t search_end = + team_cuckoo_key_size; // KOKKOSKERNELS_MACRO_MIN(team_cuckoo_key_size, + // hash + max_tries); + for (nnz_lno_t trial = hash; trial < search_end; ++trial) { + if (keys[trial] == my_b_col) { + vals[trial] += my_b_val; + fail = 0; + break; + } else if (keys[trial] == init_value) { + break; + } + } + search_end = hash; // max_tries - (team_cuckoo_key_size - hash); + + for (nnz_lno_t trial = 0; trial < search_end; ++trial) { + if (keys[trial] == my_b_col) { + vals[trial] += my_b_val; + fail = 0; + break; + } else if (keys[trial] == init_value) { + break; + } + } + } + if (fail) { + nnz_lno_t write_index = 0; + write_index = Kokkos::atomic_fetch_add(used_hash_sizes + 1, + atomic_incr_type(1)); + c_row[write_index] = my_b_col; + c_row_vals[write_index] = my_b_val; + } + global_acc_row_keys[my_index] = init_value; + } + } + + teamMember.team_barrier(); + Kokkos::single(Kokkos::PerTeam(teamMember), [&]() { + memory_space.release_chunk(global_acc_row_keys); + }); + } + + for (nnz_lno_t my_index = vector_shift; my_index < team_cuckoo_key_size; + my_index += bs) { + nnz_lno_t my_key = keys[my_index]; + if (my_key != init_value) { + scalar_t my_val = vals[my_index]; + nnz_lno_t write_index = 0; + write_index = Kokkos::atomic_fetch_add(used_hash_sizes + 1, + atomic_incr_type(1)); + c_row[write_index] = my_key; + c_row_vals[write_index] = my_val; + } + } + } + } + + // In this one row fits into shmem with team-flat-parallel + KOKKOS_INLINE_FUNCTION + void operator()(const GPUTag4 &, const team_member_t &teamMember) const { + const nnz_lno_t init_value = -1; + nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size; + const nnz_lno_t team_row_end = + KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_work_size, numrows); + + // shmem == sizeof(nnz_lno_t)*2 + sizeof(nnz_lno_t)*team_cuckoo_key_size + + // sizeof(scalar_t)*nvals + char *all_shared_memory = + (char *)(teamMember.team_shmem().get_shmem(shared_memory_size)); + + volatile nnz_lno_t *used_hash_sizes = + (volatile nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * 2; + + // holds the keys + nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * team_cuckoo_key_size; + scalar_t *vals = + KokkosKernels::Impl::alignPtr(all_shared_memory); + + int thread_rank = teamMember.team_rank(); + + int vector_rank = 0; + typedef typename std::remove_reference::type + atomic_incr_type; + Kokkos::parallel_scan( + Kokkos::ThreadVectorRange(teamMember, vector_size), + [&](const int /* threadid */, int &update, const bool final) { + if (final) { + vector_rank = update; + } + update += 1; + }); + int bs = vector_size * suggested_team_size; + int vector_shift = thread_rank * vector_size + vector_rank; + for (nnz_lno_t row_index = team_row_begin; row_index < team_row_end; + ++row_index) { + if (row_mapA[row_index] == row_mapA[row_index + 1]) // skip empty A rows + continue; +#if 1 + teamMember.team_barrier(); +#endif + const size_type c_row_begin = rowmapC[row_index]; + // const size_type c_row_end = rowmapC[row_index + 1]; + // const nnz_lno_t c_row_size = c_row_end - c_row_begin; + nnz_lno_t *c_row = entriesC.data() + c_row_begin; + scalar_t *c_row_vals = valuesC.data() + c_row_begin; + + // initialize begins. + { + nnz_lno_t num_threads = + team_cuckoo_key_size / + vector_size; // not needed as team_cuckoo_key_size is always pow2. + // + (team_cuckoo_key_size & (vector_size - 1)) * 1; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, num_threads), + [&](nnz_lno_t teamind) { + // nnz_lno_t team_shift = teamind * vector_size; + // nnz_lno_t work_to_handle = KOKKOSKERNELS_MACRO_MIN(vector_size, + // team_shmem_hash_size - team_shift); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, vector_size), + [&](nnz_lno_t i) { + keys[teamind * vector_size + i] = init_value; + vals[teamind * vector_size + i] = 0; + }); + }); + } + +#if 0 + teamMember.team_barrier(); + + Kokkos::single(Kokkos::PerTeam(teamMember),[&] () { + + for (int i = 0; i < team_shmem_hash_size; ++i){ + if (begins[i] != init_value){ + std::cout << "row_index:" << row_index << " i:" << i << " team_shmem_hash_size:" << team_shmem_hash_size << " is not init_value begins[i]:" << begins[i] << std::endl; + } + } + }); + + teamMember.team_barrier(); +#endif + // initialize hash usage sizes + Kokkos::single(Kokkos::PerTeam(teamMember), [&]() { + used_hash_sizes[0] = 0; + used_hash_sizes[1] = 0; +#if 0 + globally_used_hash_count[0] = 0; +#endif + }); +#if 0 + + teamMember.team_barrier(); +#endif +#if 0 + bool is_global_alloced = false; + nnz_lno_t *globally_used_hash_indices = NULL; +#endif + const size_type a_col_begin_offset = row_mapA[row_index]; + + nnz_lno_t a_col_ind = entriesA[a_col_begin_offset]; + scalar_t a_col_val = valuesA[a_col_begin_offset]; + + nnz_lno_t current_a_column_offset_inrow = 0; + nnz_lno_t flops_on_the_left_of_offsett = 0; + size_type current_b_read_offsett = row_mapB[a_col_ind]; + nnz_lno_t current_a_column_flops = + row_mapB[a_col_ind + 1] - current_b_read_offsett; + + // nnz_lno_t ii = left_work; + nnz_lno_t row_flops = flops_per_row(row_index); + +#if 1 + teamMember.team_barrier(); +#endif + + for (nnz_lno_t vector_read_shift = vector_shift; + vector_read_shift < row_flops; vector_read_shift += bs) { + { + nnz_lno_t my_b_col_shift = + vector_read_shift - flops_on_the_left_of_offsett; + nnz_lno_t my_b_col = init_value; + scalar_t my_b_val = 0; + nnz_lno_t hash = init_value; + int fail = 0; + + if (my_b_col_shift >= current_a_column_flops) { + do { + ++current_a_column_offset_inrow; + my_b_col_shift -= current_a_column_flops; + flops_on_the_left_of_offsett += current_a_column_flops; + a_col_ind = + entriesA[a_col_begin_offset + current_a_column_offset_inrow]; + + current_b_read_offsett = row_mapB[a_col_ind]; + current_a_column_flops = + row_mapB[a_col_ind + 1] - current_b_read_offsett; + } while (my_b_col_shift >= current_a_column_flops); + a_col_val = + valuesA[a_col_begin_offset + current_a_column_offset_inrow]; + } + + my_b_col = entriesB[my_b_col_shift + current_b_read_offsett]; + + my_b_val = + valuesB[my_b_col_shift + current_b_read_offsett] * a_col_val; + + // now insert it to first level hashmap accumulator. + hash = (my_b_col * HASHSCALAR) & team_cuckoo_hash_func; + fail = 1; + + for (nnz_lno_t trial = hash; trial < team_cuckoo_key_size;) { + if (keys[trial] == my_b_col) { + Kokkos::atomic_add(vals + trial, my_b_val); + fail = 0; + break; + } else if (keys[trial] == init_value) { + if (Kokkos::atomic_compare_exchange_strong( + keys + trial, init_value, my_b_col)) { + Kokkos::atomic_add(vals + trial, my_b_val); + fail = 0; + break; + } + } else { + ++trial; + } + } + if (fail) { + for (nnz_lno_t trial = 0; trial < hash;) { + if (keys[trial] == my_b_col) { + Kokkos::atomic_add(vals + trial, my_b_val); + fail = 0; + break; + } else if (keys[trial] == init_value) { + if (Kokkos::atomic_compare_exchange_strong( + keys + trial, init_value, my_b_col)) { + Kokkos::atomic_add(vals + trial, my_b_val); + fail = 0; + break; + } + } else { + ++trial; + } + } + } + } + } + + teamMember.team_barrier(); + for (nnz_lno_t my_index = vector_shift; my_index < team_cuckoo_key_size; + my_index += bs) { + nnz_lno_t my_key = keys[my_index]; + if (my_key != init_value) { + scalar_t my_val = vals[my_index]; + nnz_lno_t write_index = + Kokkos::atomic_fetch_add(used_hash_sizes, atomic_incr_type(1)); + c_row[write_index] = my_key; + c_row_vals[write_index] = my_val; + } + } + } + } + + size_t team_shmem_size(int /* team_size */) const { + return shared_memory_size; + } +}; + +// +// * Notes on KokkosSPGEMM_numeric_hash * +// +// Prior to this routine, KokkosSPGEMM_numeric(...) was called +// +// KokkosSPGEMM_numeric(...) : +// if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == +// this->spgemm_algorithm) : +// call KokkosSPGEMM_numeric_speed(...) +// else: +// call KokkosSPGEMM_numeric_hash(...) (this code!) +// +// * NOTE: KokkosSPGEMM_numeric_hash2(...) is not called +// +// +// KokkosSPGEMM_numeric_hash: +// +// Algorithm selection may be modified as follows +// +// algorithm_to_run: initialized to spgemm_algorithm input to +// KokkosSPGEMM_numeric_hash +// * spgemm_algorithm CANNOT be SPGEMM_KK_SPEED or SPGEMM_KK_DENSE +// +// if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == +// this->spgemm_algorithm) : +// if Cuda enabled : +// 1. perform shmem-size + partition computations (used by +// HashMapAccumulator) and flop estimate +// 2. from results of 1. select from SPGEMM_KK_MEMORY_SPREADTEAM, +// SPGEMM_KK_MEMORY_BIGSPREADTEAM, SPGEMM_KK_MEMORY +// * Note: These shmem calculations are not passed along to the +// PortableNumericCHASH functor used by kernels +// TODO check the pre-shmem calculations and functor shmem +// calculations consistent - pass shmem values to functor +// else : +// 1. determine if problem is "dense" +// 2. if dense: call "this->KokkosSPGEMM_numeric_speed" +// else : no change from algorithm_to_run; that is algorithm_to_run == +// SPGEMM_KK || SPGEMM_KK_LP +// +// else : +// skip modification of input algorithm +// +// +// +// Algorithm type matching to kernel Tag: +// +// Policy typedefs with tags found in: KokkosSparse_spgemm_impl.hpp +// +// Cuda algorithm options: +// (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) : gpu_team_policy4_t, +// i.e. GPUTag4 (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) : +// gpu_team_policy6_t, i.e. GPUTag6 (default == SPGEMM_KK_MEMORY) : +// gpu_team_policy_t, i.e. GPUTag +// +// Non-Cuda host algorithm options: +// SPGEMM_KK_LP: +// (algorithm_to_run == SPGEMM_KK_LP + Dynamic) : +// dynamic_multicore_team_policy4_t, i.e. MultiCoreTag4 (algorithm_to_run +// == SPGEMM_KK_LP + Static) : dynamic_multicore_team_policy4_t // +// typo/bug, should be multicore_team_policy4_t? +// else SPGEMM::KKMEM +// kernel label: "KOKKOSPARSE::SPGEMM::KKMEM::DYNAMIC" : +// dynamic_multicore_team_policy_t, i.e. MultiCoreTag kernel label: +// "KOKKOSPARSE::SPGEMM::KKMEM::STATIC" : multicore_team_policy_t, i.e. +// MultiCoreTag + +template +template +void KokkosSPGEMM:: + KokkosSPGEMM_numeric_hash( + c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_, + KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space) { + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\tHASH MODE" << std::endl; + } + KokkosSparse::SPGEMMAlgorithm algorithm_to_run = this->spgemm_algorithm; + nnz_lno_t brows = row_mapB.extent(0) - 1; + size_type bnnz = valsB.extent(0); + + int suggested_vector_size = + this->handle->get_suggested_vector_size(brows, bnnz); + int suggested_team_size = + this->handle->get_suggested_team_size(suggested_vector_size); + size_t shmem_size_to_use = shmem_size; + + row_lno_persistent_work_view_t flops_per_row = + this->handle->get_spgemm_handle()->row_flops; + size_t original_overall_flops = + this->handle->get_spgemm_handle()->original_overall_flops; + nnz_lno_t max_nnz = this->handle->get_spgemm_handle()->get_max_result_nnz(); + size_type overall_nnz = this->handle->get_spgemm_handle()->get_c_nnz(); + + typedef KokkosKernels::Impl::UniformMemoryPool + pool_memory_space; + nnz_lno_t min_hash_size = 1; + size_t chunksize = 1; + double first_level_cut_off = + this->handle->get_spgemm_handle()->get_first_level_hash_cut_off(); + int hash_scaler = + this->handle->get_spgemm_handle()->get_min_hash_size_scale(); + nnz_lno_t tmp_max_nnz = max_nnz; + + if (hash_scaler == 0) { + tmp_max_nnz = KOKKOSKERNELS_MACRO_MAX( + max_nnz, nnz_lno_t(this->b_col_cnt / this->concurrency + 1)); + } else { + tmp_max_nnz *= hash_scaler; + } + + // How many extra bytes are needed to align a scalar_t after an array of + // nnz_lno_t, in the worst case? Incurred once per hashmap, which may be per + // team or per thread depending on algorithm + constexpr size_t scalarAlignPad = + (alignof(scalar_t) > alignof(nnz_lno_t)) + ? (alignof(scalar_t) - alignof(nnz_lno_t)) + : 0; + + // START OF SHARED MEMORY SIZE CALCULATIONS + // NOTE: the values computed here are not actually passed to functors + // requiring shmem, the calculations here are used for algorithm selection + nnz_lno_t unit_memory = + sizeof(nnz_lno_t) * 2 + sizeof(nnz_lno_t) + sizeof(scalar_t); + nnz_lno_t team_shmem_key_size = + ((shmem_size_to_use - sizeof(nnz_lno_t) * 4 - scalarAlignPad) / + unit_memory); + // alignment padding is per-thread for algorithms with per-thread hashmap + nnz_lno_t thread_memory = + ((shmem_size_to_use / suggested_team_size - scalarAlignPad) / 8) * 8; + + nnz_lno_t thread_shmem_key_size = + ((thread_memory - sizeof(nnz_lno_t) * 4) / unit_memory); + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tinitial PortableNumericCHASH -- thread_memory:" + << thread_memory << " unit_memory:" << unit_memory + << " initial key size:" << thread_shmem_key_size << std::endl; + std::cout << "\t\tinitial PortableNumericCHASH -- team_memory:" + << shmem_size_to_use << " unit_memory:" << unit_memory + << " initial team key size:" << team_shmem_key_size << std::endl; + } + nnz_lno_t thread_shmem_hash_size = 1; + while (thread_shmem_hash_size * 2 <= thread_shmem_key_size) { + thread_shmem_hash_size = thread_shmem_hash_size * 2; + } + nnz_lno_t team_shmem_hash_size = 1; + while (team_shmem_hash_size * 2 <= team_shmem_key_size) { + team_shmem_hash_size = team_shmem_hash_size * 2; + } + // nnz_lno_t team_shared_memory_hash_func = team_shmem_hash_size - 1; + + team_shmem_key_size = + team_shmem_key_size + + ((team_shmem_key_size - team_shmem_hash_size) * sizeof(nnz_lno_t)) / + (sizeof(nnz_lno_t) * 2 + sizeof(scalar_t)); + team_shmem_key_size = (team_shmem_key_size >> 1) << 1; + + thread_shmem_key_size = + thread_shmem_key_size + + ((thread_shmem_key_size - thread_shmem_hash_size) * sizeof(nnz_lno_t)) / + (sizeof(nnz_lno_t) * 2 + sizeof(scalar_t)); + thread_shmem_key_size = (thread_shmem_key_size >> 1) << 1; + + // choose parameters + if (this->spgemm_algorithm == SPGEMM_KK || + SPGEMM_KK_LP == this->spgemm_algorithm) { + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + // then chose the best method and parameters. + size_type average_row_nnz = 0; + size_t average_row_flops = 0; + if (this->a_row_cnt > 0) { + average_row_nnz = overall_nnz / this->a_row_cnt; + average_row_flops = original_overall_flops / this->a_row_cnt; + } + int vector_length_max = + KokkosKernels::Impl::kk_get_max_vector_size(); + // if we have very low flops per row, or our maximum number of nnz is + // prett small, then we do row-base algorithm. + if (SPGEMM_KK_LP != this->spgemm_algorithm && + (average_row_nnz < (size_type)vector_length_max || + average_row_flops < 256)) { + algorithm_to_run = SPGEMM_KK_MEMORY; + // if (average_row_nnz / double (thread_shmem_key_size) > 1.5) + while (average_row_nnz > size_type(thread_shmem_key_size) && + suggested_vector_size < vector_length_max) { + suggested_vector_size = suggested_vector_size * 2; + suggested_vector_size = + KOKKOSKERNELS_MACRO_MIN(vector_length_max, suggested_vector_size); + suggested_team_size = + this->handle->get_suggested_team_size(suggested_vector_size); + thread_memory = (shmem_size_to_use / 8 / suggested_team_size) * 8; + thread_shmem_key_size = + ((thread_memory - sizeof(nnz_lno_t) * 4) / unit_memory); + thread_shmem_hash_size = 1; + while (thread_shmem_hash_size * 2 <= thread_shmem_key_size) { + thread_shmem_hash_size = thread_shmem_hash_size * 2; + } + thread_shmem_key_size = + thread_shmem_key_size + + ((thread_shmem_key_size - thread_shmem_hash_size) * + sizeof(nnz_lno_t) - + scalarAlignPad) / + (sizeof(nnz_lno_t) * 2 + sizeof(scalar_t)); + thread_shmem_key_size = (thread_shmem_key_size >> 1) << 1; + } + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t\tRunning KKMEM with suggested_vector_size:" + << suggested_vector_size + << " suggested_team_size:" << suggested_team_size + << std::endl; + } + } else { + nnz_lno_t tmp_team_cuckoo_key_size = + ((shmem_size_to_use - sizeof(nnz_lno_t) * 2 - scalarAlignPad) / + (sizeof(nnz_lno_t) + sizeof(scalar_t))); + int team_cuckoo_key_size = 1; + while (team_cuckoo_key_size * 2 < tmp_team_cuckoo_key_size) + team_cuckoo_key_size = team_cuckoo_key_size * 2; + suggested_vector_size = vector_length_max; + suggested_team_size = + this->handle->get_suggested_team_size(suggested_vector_size); + algorithm_to_run = SPGEMM_KK_MEMORY_BIGSPREADTEAM; + while (average_row_nnz < + team_cuckoo_key_size / 2 * + (KOKKOSKERNELS_MACRO_MIN(first_level_cut_off + 0.05, 1))) { + shmem_size_to_use = shmem_size_to_use / 2; + tmp_team_cuckoo_key_size = + ((shmem_size_to_use - sizeof(nnz_lno_t) * 2 - scalarAlignPad) / + (sizeof(nnz_lno_t) + sizeof(scalar_t))); + team_cuckoo_key_size = 1; + while (team_cuckoo_key_size * 2 < tmp_team_cuckoo_key_size) + team_cuckoo_key_size = team_cuckoo_key_size * 2; + + suggested_team_size = suggested_team_size / 2; + } + if (average_row_flops > + size_t(2) * suggested_team_size * suggested_vector_size && + average_row_nnz > + size_type(team_cuckoo_key_size) * + (KOKKOSKERNELS_MACRO_MIN(first_level_cut_off + 0.05, 1))) { + shmem_size_to_use = shmem_size_to_use * 2; + tmp_team_cuckoo_key_size = + ((shmem_size_to_use - sizeof(nnz_lno_t) * 2 - scalarAlignPad) / + (sizeof(nnz_lno_t) + sizeof(scalar_t))); + team_cuckoo_key_size = 1; + while (team_cuckoo_key_size * 2 < tmp_team_cuckoo_key_size) + team_cuckoo_key_size = team_cuckoo_key_size * 2; + suggested_team_size = suggested_team_size * 2; + } +#ifdef FIRSTPARAMS + suggested_team_size = KOKKOSKERNELS_MACRO_MAX(4, suggested_team_size); +#else + suggested_team_size = KOKKOSKERNELS_MACRO_MAX(2, suggested_team_size); +#endif + if (max_nnz < + team_cuckoo_key_size * + KOKKOSKERNELS_MACRO_MIN(first_level_cut_off + 0.20, 1)) { + algorithm_to_run = SPGEMM_KK_MEMORY_SPREADTEAM; + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t\tRunning SPGEMM_KK_MEMORY_SPREADTEAM with " + "suggested_vector_size:" + << suggested_vector_size + << " suggested_team_size:" << suggested_team_size + << " shmem_size_to_use:" << shmem_size_to_use + << std::endl; + } + } else { + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t\tRunning SPGEMM_KK_MEMORY_BIGSPREADTEAM with " + "suggested_vector_size:" + << suggested_vector_size + << " suggested_team_size:" << suggested_team_size + << " shmem_size_to_use:" << shmem_size_to_use + << std::endl; + } + } + } + } else { + bool run_dense = false; + nnz_lno_t max_column_cut_off = + this->handle->get_spgemm_handle()->MaxColDenseAcc; + nnz_lno_t col_size = this->b_col_cnt; + if (col_size < max_column_cut_off) { + run_dense = true; + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t\tRunning SPGEMM_KK_DENSE col_size:" << col_size + << " max_column_cut_off:" << max_column_cut_off + << std::endl; + } + } else { + // round up maxNumRoughNonzeros to closest power of 2. + nnz_lno_t tmp_min_hash_size = 1; + while (tmp_max_nnz > tmp_min_hash_size) { + tmp_min_hash_size *= 4; + } + + size_t kkmem_chunksize = + tmp_min_hash_size; // this is for used hash indices + kkmem_chunksize += tmp_min_hash_size; // this is for the hash begins + kkmem_chunksize += max_nnz; // this is for hash nexts + kkmem_chunksize = kkmem_chunksize * sizeof(nnz_lno_t) + scalarAlignPad; + size_t dense_chunksize = + (col_size + col_size / sizeof(scalar_t) + 1) * sizeof(scalar_t); + + if (kkmem_chunksize >= dense_chunksize * 0.5) { + run_dense = true; + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t\tRunning SPGEMM_KK_SPEED kkmem_chunksize:" + << kkmem_chunksize + << " dense_chunksize:" << dense_chunksize << std::endl; + } + } else { + run_dense = false; + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t\tRunning SPGEMM_KK_MEMORY col_size:" << col_size + << " max_column_cut_off:" << max_column_cut_off + << std::endl; + } + } + } + + if (run_dense) { + this->KokkosSPGEMM_numeric_speed(rowmapC_, entriesC_, valuesC_, + lcl_my_exec_space); + return; + } + } + } + nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size( + suggested_team_size, concurrency, a_row_cnt); + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tPortableNumericCHASH -- adjusted hashsize:" + << thread_shmem_hash_size + << " thread_shmem_key_size:" << thread_shmem_key_size + << std::endl; + std::cout << "\t\tPortableNumericCHASH -- adjusted team hashsize:" + << team_shmem_hash_size + << " team_shmem_key_size:" << team_shmem_key_size << std::endl; + } + // END OF SHARED MEMORY SIZE CALCULATIONS + + // required memory for L2 + if (KokkosKernels::Impl::kk_is_gpu_exec_space< + typename HandleType::HandleExecSpace>()) { + if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) { + tmp_max_nnz = 1; + } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) { + } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGTEAM || + algorithm_to_run == SPGEMM_KK_MEMORY_TEAM) { + // tmp_max_nnz -= team_shmem_key_size; + } else { + // tmp_max_nnz -= thread_shmem_key_size; + } + } + + // START SIZE CALCULATIONS FOR MEMORYPOOL + if (algorithm_to_run == SPGEMM_KK_LP) { + while (tmp_max_nnz > min_hash_size) { + min_hash_size *= 4; + } + chunksize = min_hash_size; // this is for used hash keys + chunksize += max_nnz; // this is for used hash keys + chunksize += scalarAlignPad; // for padding betwen keys and values + chunksize += min_hash_size * sizeof(scalar_t) / + sizeof(nnz_lno_t); // this is for the hash values + } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) { + while (tmp_max_nnz > min_hash_size) { + min_hash_size *= 2; // try to keep it as low as possible because hashes + // are not tracked. + } + chunksize = min_hash_size; // this is for used hash keys + chunksize += scalarAlignPad; // for padding between keys and values + chunksize += min_hash_size * sizeof(scalar_t) / + sizeof(nnz_lno_t); // this is for the hash values + } else { + while (tmp_max_nnz > min_hash_size) { + min_hash_size *= 4; + } + chunksize = min_hash_size; // this is for used hash indices + chunksize += min_hash_size; // this is for the hash begins + chunksize += max_nnz; // this is for hash nexts + } + + nnz_lno_t num_chunks = + this->template compute_num_pool_chunks( + chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); + + // END SIZE CALCULATIONS FOR MEMORYPOOL + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t max_nnz: " << max_nnz << " chunk_size:" << chunksize + << " min_hash_size:" << min_hash_size + << " concurrency:" << concurrency + << " MyExecSpace::concurrency():" << MyExecSpace::concurrency() + << " numchunks:" << num_chunks << std::endl; + } + + KokkosKernels::Impl::PoolType my_pool_type = + KokkosKernels::Impl::OneThread2OneChunk; + + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; + } + + Kokkos::Timer timer1; + pool_memory_space m_space(num_chunks, chunksize, -1, my_pool_type); + MyExecSpace().fence(); + + if (KOKKOSKERNELS_VERBOSE) { + m_space.print_memory_pool(); + std::cout << "\t\tPool Alloc Time:" << timer1.seconds() << std::endl; + std::cout << "\t\tPool Size(MB):" + << sizeof(nnz_lno_t) * (num_chunks * chunksize) / 1024. / 1024. + << std::endl; + } + + PortableNumericCHASH< + const_a_lno_row_view_t, const_a_lno_nnz_view_t, const_a_scalar_nnz_view_t, + const_b_lno_row_view_t, const_b_lno_nnz_view_t, const_b_scalar_nnz_view_t, + c_row_view_t, c_lno_nnz_view_t, c_scalar_nnz_view_t, pool_memory_space> + sc(a_row_cnt, row_mapA, entriesA, valsA, + + row_mapB, entriesB, valsB, + + rowmapC_, entriesC_, valuesC_, shmem_size_to_use, + suggested_vector_size, m_space, min_hash_size, max_nnz, + suggested_team_size, + + lcl_my_exec_space, team_row_chunk_size, first_level_cut_off, + flops_per_row, KOKKOSKERNELS_VERBOSE); + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tvector_size:" << suggested_vector_size + << " chunk_size:" << team_row_chunk_size + << " suggested_team_size:" << suggested_team_size << std::endl; + } + timer1.reset(); + + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) { + if (thread_shmem_key_size <= 0) { + std::cout << "KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: " + "Insufficient shmem available for key for hash map " + "accumulator - Terminating" + << std::endl; + std::cout << " thread_shmem_key_size = " << thread_shmem_key_size + << std::endl; + throw std::runtime_error( + " KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: " + "Insufficient shmem available for key for hash map accumulator "); + } + Kokkos::parallel_for( + "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY_SPREADTEAM", + gpu_team_policy4_t(a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + MyExecSpace().fence(); + + } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) { + if (thread_shmem_key_size <= 0) { + std::cout << "KokkosSPGEMM_numeric_hash " + "SPGEMM_KK_MEMORY_BIGSPREADTEAM: Insufficient shmem " + "available for key for hash map accumulator - Terminating" + << std::endl; + std::cout << " thread_shmem_key_size = " << thread_shmem_key_size + << std::endl; + throw std::runtime_error( + " KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY_BIGSPREADTEAM: " + "Insufficient shmem available for key for hash map accumulator "); + } + Kokkos::parallel_for( + "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY_BIGSPREADTEAM", + gpu_team_policy6_t(a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + } else { + if (team_shmem_key_size <= 0) { + std::cout + << "KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY: Insufficient shmem " + "available for key for hash map accumulator - Terminating" + << std::endl; + std::cout << " team_shmem_key_size = " << team_shmem_key_size + << std::endl; + throw std::runtime_error( + " KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY: Insufficient shmem " + "available for key for hash map accumulator "); + } + Kokkos::parallel_for( + "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY", + gpu_team_policy_t(a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + } + MyExecSpace().fence(); + } else { + if (algorithm_to_run == SPGEMM_KK_LP) { + if (use_dynamic_schedule) { + Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_LP::DYNAMIC", + dynamic_multicore_team_policy4_t( + a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + } else { + Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_LP::STATIC", + multicore_team_policy4_t( + a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + } + } else { + if (use_dynamic_schedule) { + Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::KKMEM::DYNAMIC", + dynamic_multicore_team_policy_t( + a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + } else { + Kokkos::parallel_for( + "KOKKOSPARSE::SPGEMM::KKMEM::STATIC", + multicore_team_policy_t(a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + } + } + MyExecSpace().fence(); + } + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tNumeric TIME:" << timer1.seconds() << std::endl; + } +} + +// 01/30/2020: this code seems to be unused within any of the kokkos-kernels +// spgemm numeric phase algorithms +// TODO determine if this code should be revived for use or removed +// this is to isolate the memory use of accumulators and A,B,C. +// normally accumulators can use memory of C directly, but in this one we +// separate it for experimenting. +template +template +void KokkosSPGEMM:: + KokkosSPGEMM_numeric_hash2( + c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_, + KokkosKernels::Impl::ExecSpaceType my_exec_space_) { + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\tHASH MODE" << std::endl; + } + + nnz_lno_t brows = row_mapB.extent(0) - 1; + size_type bnnz = valsB.extent(0); + + int suggested_vector_size = + this->handle->get_suggested_vector_size(brows, bnnz); + int suggested_team_size = + this->handle->get_suggested_team_size(suggested_vector_size); + nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size( + suggested_team_size, concurrency, a_row_cnt); + + typedef KokkosKernels::Impl::UniformMemoryPool + pool_memory_space; + + nnz_lno_t max_nnz = this->handle->get_spgemm_handle()->get_max_result_nnz(); + nnz_lno_t min_hash_size = 1; + while (max_nnz > min_hash_size) { + min_hash_size *= 4; + } + + size_t chunksize = min_hash_size; // this is for used hash indices + chunksize += min_hash_size; // this is for the hash begins + chunksize += max_nnz; // this is for hash nexts + chunksize += max_nnz; // this is for indices + chunksize += + max_nnz * (sizeof(scalar_t) / sizeof(nnz_lno_t)); // this is for values + int num_chunks = concurrency / suggested_vector_size; + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t max_nnz: " << max_nnz << " chunk_size:" << chunksize + << " numchunks:" << num_chunks << std::endl; + } + + KokkosKernels::Impl::PoolType my_pool_type = + KokkosKernels::Impl::OneThread2OneChunk; + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; + } + + Kokkos::Timer timer1; + pool_memory_space m_space(num_chunks, chunksize, -1, my_pool_type); + MyExecSpace().fence(); + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tPool Alloc Time:" << timer1.seconds() << std::endl; + std::cout << "\t\tPool Size(MB):" + << sizeof(nnz_lno_t) * (num_chunks * chunksize) / 1024. / 1024. + << std::endl; + } + double first_level_cut_off = + this->handle->get_spgemm_handle()->get_first_level_hash_cut_off(); + + PortableNumericCHASH< + const_a_lno_row_view_t, const_a_lno_nnz_view_t, const_a_scalar_nnz_view_t, + const_b_lno_row_view_t, const_b_lno_nnz_view_t, const_b_scalar_nnz_view_t, + c_row_view_t, c_lno_nnz_view_t, c_scalar_nnz_view_t, pool_memory_space> + sc(a_row_cnt, row_mapA, entriesA, valsA, + + row_mapB, entriesB, valsB, + + rowmapC_, entriesC_, valuesC_, shmem_size, suggested_vector_size, + m_space, min_hash_size, max_nnz, suggested_team_size, + + my_exec_space_, team_row_chunk_size, first_level_cut_off, + this->handle->get_spgemm_handle()->row_flops, KOKKOSKERNELS_VERBOSE); + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tvector_size:" << suggested_vector_size + << " chunk_size:" << team_row_chunk_size << std::endl; + } + timer1.reset(); + + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + Kokkos::parallel_for( + "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY2", + gpu_team_policy_t(a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + MyExecSpace().fence(); + } else { + if (use_dynamic_schedule) { + Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY_DYNAMIC", + dynamic_multicore_team_policy2_t( + a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + } else { + Kokkos::parallel_for( + "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY_STATIC", + multicore_team_policy2_t(a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + } + MyExecSpace().fence(); + } + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tNumeric TIME:" << timer1.seconds() << std::endl; + } +} + +} // namespace Impl +} // namespace KokkosSparse diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp new file mode 100644 index 0000000000..ce3501c447 --- /dev/null +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp @@ -0,0 +1,234 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSSPARSE_SPGEMM_DEBUG_HPP_ +#define KOKKOSSPARSE_SPGEMM_DEBUG_HPP_ +#include "KokkosKernels_helpers.hpp" +namespace KokkosSparse { + +namespace Impl { + +template +void spgemm_debug_symbolic(KernelHandle *handle, + typename KernelHandle::nnz_lno_t m, + typename KernelHandle::nnz_lno_t /* n */, + typename KernelHandle::nnz_lno_t k, + alno_row_view_t_ row_mapA, alno_nnz_view_t_ entriesA, + + bool /* transposeA */, blno_row_view_t_ row_mapB, + blno_nnz_view_t_ entriesB, bool /* transposeB */, + clno_row_view_t_ row_mapC) { + typename alno_row_view_t_::HostMirror h_rma = + Kokkos::create_mirror_view(row_mapA); + Kokkos::deep_copy(h_rma, row_mapA); + typename alno_nnz_view_t_::HostMirror h_enta = + Kokkos::create_mirror_view(entriesA); + Kokkos::deep_copy(h_enta, entriesA); + + typename blno_row_view_t_::HostMirror h_rmb = + Kokkos::create_mirror_view(row_mapB); + Kokkos::deep_copy(h_rmb, row_mapB); + typename blno_nnz_view_t_::HostMirror h_entb = + Kokkos::create_mirror_view(entriesB); + Kokkos::deep_copy(h_entb, entriesB); + typename clno_row_view_t_::HostMirror h_rmc = + Kokkos::create_mirror_view(row_mapC); + Kokkos::fence(); + + typedef typename KernelHandle::nnz_lno_t lno_t; + typedef typename KernelHandle::size_type size_type; + // typedef typename KernelHandle::nnz_scalar_t scalar_t; + + std::vector acc_flag(k, false); + + std::vector result_c_col_indices(k); + + size_type result_index = 0; + + h_rmc(0) = 0; + for (lno_t i = 0; i < m; ++i) { + const size_type a_row_begin = h_rma(i); + const size_type a_row_end = h_rma(i + 1); + lno_t a_row_size = a_row_end - a_row_begin; + lno_t row_size = 0; + + for (lno_t j = 0; j < a_row_size; ++j) { + size_type ind = a_row_begin + j; + lno_t col = h_enta(ind); + // scalar_t val = h_vala(ind); + + const size_type b_row_begin = h_rmb(col); + const size_type b_row_end = h_rmb(col + 1); + lno_t b_row_size = b_row_end - b_row_begin; + for (lno_t z = 0; z < b_row_size; ++z) { + size_type ind_ = b_row_begin + z; + lno_t b_col = h_entb(ind_); + // scalar_t b_val = h_valb(ind_); + // if (i == 0) std::cout << "\tb col:" << b_col << std::endl; + if (acc_flag[b_col] == false) { + acc_flag[b_col] = true; + result_c_col_indices[row_size++] = b_col; + } + } + } + result_index += row_size; + h_rmc(i + 1) = result_index; + // size_type c_row_begin = h_rmc(i); + + // if (i == 0) std::cout << "result_cols" << std::endl; + + for (lno_t j = 0; j < row_size; ++j) { + lno_t result_col = result_c_col_indices[j]; + acc_flag[result_col] = false; + } + } + + handle->get_spgemm_handle()->set_c_nnz(result_index); + Kokkos::deep_copy(row_mapC, h_rmc); + Kokkos::fence(); +} + +template +void spgemm_debug_numeric(KernelHandle * /* handle */, + typename KernelHandle::nnz_lno_t m, + typename KernelHandle::nnz_lno_t /* n */, + typename KernelHandle::nnz_lno_t k, + alno_row_view_t_ row_mapA, alno_nnz_view_t_ entriesA, + ascalar_nnz_view_t_ valuesA, + + bool /* transposeA */, blno_row_view_t_ row_mapB, + blno_nnz_view_t_ entriesB, + bscalar_nnz_view_t_ valuesB, bool /* transposeB */, + clno_row_view_t_ row_mapC, clno_nnz_view_t_ entriesC, + cscalar_nnz_view_t_ valuesC) { + typename alno_row_view_t_::HostMirror h_rma = + Kokkos::create_mirror_view(row_mapA); + Kokkos::deep_copy(h_rma, row_mapA); + typename alno_nnz_view_t_::HostMirror h_enta = + Kokkos::create_mirror_view(entriesA); + Kokkos::deep_copy(h_enta, entriesA); + typename ascalar_nnz_view_t_::HostMirror h_vala = + Kokkos::create_mirror_view(valuesA); + Kokkos::deep_copy(h_vala, valuesA); + + typename blno_row_view_t_::HostMirror h_rmb = + Kokkos::create_mirror_view(row_mapB); + Kokkos::deep_copy(h_rmb, row_mapB); + typename blno_nnz_view_t_::HostMirror h_entb = + Kokkos::create_mirror_view(entriesB); + Kokkos::deep_copy(h_entb, entriesB); + typename bscalar_nnz_view_t_::HostMirror h_valb = + Kokkos::create_mirror_view(valuesB); + Kokkos::deep_copy(h_valb, valuesB); + typename clno_row_view_t_::HostMirror h_rmc = + Kokkos::create_mirror_view(row_mapC); + Kokkos::deep_copy(h_rmc, row_mapC); + + typename clno_nnz_view_t_::HostMirror h_entc = + Kokkos::create_mirror_view(entriesC); + typename cscalar_nnz_view_t_::HostMirror h_valc = + Kokkos::create_mirror_view(valuesC); + Kokkos::fence(); + + typedef typename KernelHandle::nnz_lno_t lno_t; + typedef typename KernelHandle::size_type size_type; + typedef typename KernelHandle::nnz_scalar_t scalar_t; + + std::vector accumulator(k, 0); + std::vector acc_flag(k, false); + + h_rmc(0) = 0; + for (lno_t i = 0; i < m; ++i) { + const size_type a_row_begin = h_rma(i); + const size_type a_row_end = h_rma(i + 1); + lno_t a_row_size = a_row_end - a_row_begin; + + size_type c_row_begin = h_rmc(i); + lno_t c_row_size = h_rmc(i + 1) - c_row_begin; + lno_t c_row_size_counter = 0; + + for (lno_t j = 0; j < a_row_size; ++j) { + size_type ind = a_row_begin + j; + lno_t col = h_enta(ind); + scalar_t val = h_vala(ind); + const size_type b_row_begin = h_rmb(col); + const size_type b_row_end = h_rmb(col + 1); + lno_t b_row_size = b_row_end - b_row_begin; + for (lno_t z = 0; z < b_row_size; ++z) { + size_type ind_ = b_row_begin + z; + lno_t b_col = h_entb(ind_); + scalar_t b_val = h_valb(ind_); + + if (acc_flag[b_col] == false) { + acc_flag[b_col] = true; + h_entc(c_row_begin + c_row_size_counter++) = b_col; + } + accumulator[b_col] += b_val * val; + } + } + + // if (i == 0) std::cout << "result_cols" << std::endl; + + for (lno_t j = 0; j < c_row_size; ++j) { + size_type ind = c_row_begin + j; + lno_t result_col = h_entc(ind); + h_valc(ind) = accumulator[result_col]; + accumulator[result_col] = 0; + acc_flag[result_col] = false; + } + } + + Kokkos::deep_copy(entriesC, h_entc); + Kokkos::deep_copy(valuesC, h_valc); + Kokkos::fence(); +} + +} // namespace Impl +} // namespace KokkosSparse +#endif diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp new file mode 100644 index 0000000000..bc185c0cd1 --- /dev/null +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp @@ -0,0 +1,637 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include "KokkosKernels_Utils.hpp" + +namespace KokkosSparse { + +namespace Impl { + +template +template +struct KokkosSPGEMM::NumericCMEM_CPU { + nnz_lno_t numrows; + nnz_lno_t numcols; + + a_row_view_t row_mapA; + a_nnz_view_t entriesA; + a_scalar_view_t valuesA; + + b_row_view_t row_mapB; + b_nnz_view_t entriesB; + b_scalar_view_t valuesB; + + c_row_view_t rowmapC; + c_nnz_view_t entriesC; + c_scalar_view_t valuesC; + mpool_type memory_space; + + nnz_lno_t *pEntriesC; + scalar_t *pVals; + const KokkosKernels::Impl::ExecSpaceType my_exec_space; + const nnz_lno_t team_work_size; + + NumericCMEM_CPU(nnz_lno_t m_, nnz_lno_t k_, a_row_view_t row_mapA_, + a_nnz_view_t entriesA_, a_scalar_view_t valuesA_, + + b_row_view_t row_mapB_, b_nnz_view_t entriesB_, + b_scalar_view_t valuesB_, + + c_row_view_t rowmapC_, c_nnz_view_t entriesC_, + c_scalar_view_t valuesC_, mpool_type memory_space_, + const KokkosKernels::Impl::ExecSpaceType my_exec_space_, + nnz_lno_t team_row_chunk_size) + : numrows(m_), + numcols(k_), + row_mapA(row_mapA_), + entriesA(entriesA_), + valuesA(valuesA_), + + row_mapB(row_mapB_), + entriesB(entriesB_), + valuesB(valuesB_), + + rowmapC(rowmapC_), + entriesC(entriesC_), + valuesC(valuesC_), + memory_space(memory_space_), + pEntriesC(entriesC_.data()), + pVals(valuesC.data()), + my_exec_space(my_exec_space_), + team_work_size(team_row_chunk_size) {} + + KOKKOS_INLINE_FUNCTION + size_t get_thread_id(const size_t row_index) const { + switch (my_exec_space) { + default: return row_index; +#if defined(KOKKOS_ENABLE_SERIAL) + case KokkosKernels::Impl::Exec_SERIAL: return 0; +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + case KokkosKernels::Impl::Exec_OMP: + return Kokkos::OpenMP::impl_hardware_thread_id(); +#endif +#if defined(KOKKOS_ENABLE_THREADS) + case KokkosKernels::Impl::Exec_THREADS: + return Kokkos::Threads::impl_hardware_thread_id(); +#endif +#if defined(KOKKOS_ENABLE_CUDA) + case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined(KOKKOS_ENABLE_HIP) + case KokkosKernels::Impl::Exec_HIP: return row_index; +#endif + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const MultiCoreTag &, const team_member_t &teamMember) const { + nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size; + const nnz_lno_t team_row_end = + KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_work_size, numrows); + + scalar_t *dense_accum = NULL; + size_t tid = get_thread_id(team_row_begin + teamMember.team_rank()); + while (dense_accum == NULL) { + dense_accum = (scalar_t *)(memory_space.allocate_chunk(tid)); + } + char *marker = (char *)(dense_accum + numcols); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), + [&](const nnz_lno_t &row_index) { + const size_type c_row_begin = rowmapC[row_index]; + nnz_lno_t *myentries = pEntriesC + c_row_begin; + scalar_t *myvals = pVals + c_row_begin; + + nnz_lno_t current_col_index = 0; + const size_type col_begin = row_mapA[row_index]; + const nnz_lno_t nnza = nnz_lno_t(row_mapA[row_index + 1] - col_begin); + + for (nnz_lno_t colind = 0; colind < nnza; ++colind) { + size_type a_col = colind + col_begin; + nnz_lno_t rowB = entriesA[a_col]; + scalar_t valA = valuesA[a_col]; + + size_type rowBegin = row_mapB(rowB); + nnz_lno_t left_work = row_mapB(rowB + 1) - rowBegin; + for (int i = 0; i < left_work; ++i) { + const size_type adjind = i + rowBegin; + nnz_lno_t b_col_ind = entriesB[adjind]; + scalar_t b_val = valuesB[adjind] * valA; + if (marker[b_col_ind] == 0) { + marker[b_col_ind] = 1; + myentries[current_col_index++] = b_col_ind; + } + dense_accum[b_col_ind] += b_val; + } + } + for (nnz_lno_t i = 0; i < current_col_index; ++i) { + nnz_lno_t ind = myentries[i]; + myvals[i] = dense_accum[ind]; + dense_accum[ind] = 0; + marker[ind] = 0; + } + }); + memory_space.release_chunk(dense_accum); + } +}; + +template +template + +struct KokkosSPGEMM::NumericCMEM { + nnz_lno_t numrows; + + a_row_view_t__ row_mapA; + a_nnz_view_t__ entriesA; + a_scalar_view_t__ valuesA; + + b_row_view_t__ row_mapB; + b_nnz_view_t__ entriesB; + b_scalar_view_t__ valuesB; + + c_row_view_t__ rowmapC; + c_nnz_view_t__ entriesC; + c_scalar_view_t__ valuesC; + + c_nnz_tmp_view_t beginsC; + c_nnz_tmp_view_t nextsC; + + nnz_lno_t *pbeginsC, *pnextsC, *pEntriesC; + scalar_t *pvaluesC; + + const size_t shared_memory_size; + const int vector_size; + const nnz_lno_t team_work_size; + + const int unit_memory; // begins, nexts, and keys. No need for vals yet. + const int suggested_team_size; + const int thread_memory; + nnz_lno_t shmem_key_size; + nnz_lno_t shared_memory_hash_func; + nnz_lno_t shmem_hash_size; + + NumericCMEM(nnz_lno_t m_, a_row_view_t__ row_mapA_, a_nnz_view_t__ entriesA_, + a_scalar_view_t__ valuesA_, + + b_row_view_t__ row_mapB_, b_nnz_view_t__ entriesB_, + b_scalar_view_t__ valuesB_, + + c_row_view_t__ rowmapC_, c_nnz_view_t__ entriesC_, + c_scalar_view_t__ valuesC_, + + c_nnz_tmp_view_t beginsC_, c_nnz_tmp_view_t nextsC_, + + const size_type sharedMemorySize_, + const int suggested_vector_size, + const nnz_lno_t team_row_chunk_size, int suggested_team_size_, + bool KOKKOSKERNELS_VERBOSE_) + : numrows(m_), + row_mapA(row_mapA_), + entriesA(entriesA_), + valuesA(valuesA_), + + row_mapB(row_mapB_), + entriesB(entriesB_), + valuesB(valuesB_), + + rowmapC(rowmapC_), + entriesC(entriesC_), + valuesC(valuesC_), + beginsC(beginsC_), + nextsC(nextsC_), + pbeginsC(beginsC_.data()), + pnextsC(nextsC_.data()), + pEntriesC(entriesC_.data()), + pvaluesC(valuesC_.data()), + shared_memory_size(sharedMemorySize_), + + vector_size(suggested_vector_size), + team_work_size(team_row_chunk_size), + + unit_memory(sizeof(nnz_lno_t) * 2 + sizeof(nnz_lno_t) + + sizeof(scalar_t)), + suggested_team_size(suggested_team_size_), + thread_memory((shared_memory_size / 8 / suggested_team_size_) * 8), + shmem_key_size(), + shared_memory_hash_func(), + shmem_hash_size(1) { + constexpr size_t scalarAlignPad = + (alignof(scalar_t) > alignof(nnz_lno_t)) + ? (alignof(scalar_t) - alignof(nnz_lno_t)) + : 0; + shmem_key_size = ((thread_memory - sizeof(nnz_lno_t) * 2 - scalarAlignPad) / + unit_memory); + if (KOKKOSKERNELS_VERBOSE_) { + std::cout << "\t\tNumericCMEM -- thread_memory:" << thread_memory + << " unit_memory:" << unit_memory + << " initial key size:" << shmem_key_size << std::endl; + } + while (shmem_hash_size * 2 <= shmem_key_size) { + shmem_hash_size = shmem_hash_size * 2; + } + shared_memory_hash_func = shmem_hash_size - 1; + + shmem_key_size = shmem_key_size + + ((shmem_key_size - shmem_hash_size) * sizeof(nnz_lno_t)) / + (sizeof(nnz_lno_t) * 2 + sizeof(scalar_t)); + shmem_key_size = (shmem_key_size >> 1) << 1; + + if (KOKKOSKERNELS_VERBOSE_) { + std::cout << "\t\tNumericCMEM -- adjusted hashsize:" << shmem_hash_size + << " shmem_key_size:" << shmem_key_size << std::endl; + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const GPUTag &, const team_member_t &teamMember) const { + // get the beginning and end rows of the team. + nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size; + const nnz_lno_t team_row_end = + KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_work_size, numrows); + + char *all_shared_memory = + (char *)(teamMember.team_shmem().get_shmem(shared_memory_size)); + + // shift it to the thread private part + all_shared_memory += thread_memory * teamMember.team_rank(); + + // used_hash_sizes hold the size of 1st and 2nd level hashes + volatile nnz_lno_t *used_hash_sizes = + (volatile nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * 2; + + nnz_lno_t *begins = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * shmem_hash_size; + + // poins to the next elements + nnz_lno_t *nexts = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size; + + // holds the keys + nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size; + scalar_t *vals = + KokkosKernels::Impl::alignPtr(all_shared_memory); + + KokkosKernels::Experimental::HashmapAccumulator< + nnz_lno_t, nnz_lno_t, scalar_t, + KokkosKernels::Experimental::HashOpType::bitwiseAnd> + hm(shmem_key_size, shared_memory_hash_func, begins, nexts, keys, vals); + + // issue-508, TODO: understand and re-work below parallel_for loop. + // Inialize hm2 with correct max_value_size and hashOpRHS + // global_memory_hash_size is computed, per team of threads -- this is + // hashOpRHS. + + KokkosKernels::Experimental::HashmapAccumulator< + nnz_lno_t, nnz_lno_t, scalar_t, + KokkosKernels::Experimental::HashOpType::modulo> + hm2(0, 0, NULL, NULL, NULL, NULL); + /* + KokkosKernels::Experimental::HashmapAccumulator + hm2(global_memory_hash_size, global_memory_hash_size, + pbeginsC + c_row_begin, pnextsC + c_row_begin, pEntriesC + c_row_begin, + pvaluesC + c_row_begin); + */ + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), + [&](const nnz_lno_t &row_index) { + const size_type c_row_begin = rowmapC[row_index]; + const nnz_lno_t global_memory_hash_size = + nnz_lno_t(rowmapC[row_index + 1] - c_row_begin); + + hm2.keys = pEntriesC + c_row_begin; + hm2.values = pvaluesC + c_row_begin; + hm2.hash_begins = pbeginsC + c_row_begin; + hm2.hash_nexts = pnextsC + c_row_begin; + + // initialize begins. + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, shmem_hash_size), + [&](int i) { begins[i] = -1; }); + + // initialize hash usage sizes + Kokkos::single(Kokkos::PerThread(teamMember), [&]() { + used_hash_sizes[0] = 0; + used_hash_sizes[1] = 0; + }); + + const size_type col_begin = row_mapA[row_index]; + const nnz_lno_t left_work = + nnz_lno_t(row_mapA[row_index + 1] - col_begin); + + for (nnz_lno_t colind = 0; colind < left_work; ++colind) { + size_type a_col = colind + col_begin; + nnz_lno_t rowB = entriesA[a_col]; + scalar_t valA = valuesA[a_col]; + + size_type rowBegin = row_mapB(rowB); + nnz_lno_t left_work_ = row_mapB(rowB + 1) - rowBegin; + + while (left_work_) { + nnz_lno_t work_to_handle = + KOKKOSKERNELS_MACRO_MIN(vector_size, left_work_); + nnz_lno_t b_col_ind = -1; + scalar_t b_val = -1; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, work_to_handle), + [&](nnz_lno_t i) { + const size_type adjind = i + rowBegin; + b_col_ind = entriesB[adjind]; + b_val = valuesB[adjind] * valA; + }); + + int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeAdd( + b_col_ind, b_val, used_hash_sizes); + + int overall_num_unsuccess = 0; + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(teamMember, vector_size), + [&](const int /* threadid */, int &overall_num_unsuccess_) { + overall_num_unsuccess_ += num_unsuccess; + }, + overall_num_unsuccess); + + if (overall_num_unsuccess) { + nnz_lno_t hash_ = -1; + if (num_unsuccess) { + hash_ = b_col_ind % global_memory_hash_size; + } + + // int insertion = + hm2.vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( + teamMember, vector_size, hash_, b_col_ind, b_val, + used_hash_sizes + 1, global_memory_hash_size); + } + left_work_ -= work_to_handle; + rowBegin += work_to_handle; + } + } + + Kokkos::single(Kokkos::PerThread(teamMember), [&]() { + if (used_hash_sizes[0] > shmem_key_size) + used_hash_sizes[0] = shmem_key_size; + }); + + size_type num_elements = used_hash_sizes[0]; + + size_type written_index = used_hash_sizes[1]; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, num_elements), + [&](size_type i) { + pEntriesC[c_row_begin + written_index + i] = keys[i]; + pvaluesC[c_row_begin + written_index + i] = vals[i]; + }); + }); + } + + size_t team_shmem_size(int /* team_size */) const { + return shared_memory_size; + } +}; + +// +// * Notes on KokkosSPGEMM_numeric_speed * +// +// Prior to this routine, KokkosSPGEMM_numeric(...) was called +// +// KokkosSPGEMM_numeric(...) : +// if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == +// this->spgemm_algorithm) : +// call KokkosSPGEMM_numeric_speed(...) +// else: +// call KokkosSPGEMM_numeric_hash(...) +// +// +// KokkosSPGEMM_numeric_speed: +// +// Algorithm selection as follows and matching to kernel Tag: +// +// Policy typedefs with tags found in: KokkosSparse_spgemm_impl.hpp +// +// if GPU: +// "KokkosSparse::NumericCMEM::KKSPEED::GPU" : gpu_team_policy_t, i.e. +// GPUTag +// +// else : +// "KokkosSparse::NumericCMEM_CPU::DENSE::DYNAMIC" : +// dynamic_multicore_team_policy_t, i.e. MultiCoreTag +// "KokkosSparse::NumericCMEM_CPU::DENSE::STATIC" : multicore_team_policy_t, +// i.e. MultiCoreTag +// + +template +template +void KokkosSPGEMM:: + KokkosSPGEMM_numeric_speed( + c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_, + KokkosKernels::Impl::ExecSpaceType my_exec_space_) { + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\tSPEED MODE" << std::endl; + } + + nnz_lno_t brows = row_mapB.extent(0) - 1; + size_type bnnz = valsB.extent(0); + + // get suggested vector size, teamsize and row chunk size. + int suggested_vector_size = + this->handle->get_suggested_vector_size(brows, bnnz); + int suggested_team_size = + this->handle->get_suggested_team_size(suggested_vector_size); + nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size( + suggested_team_size, concurrency, a_row_cnt); + + Kokkos::Timer numeric_speed_timer_with_free; + + if (KokkosKernels::Impl::kk_is_gpu_exec_space< + typename HandleType::HandleExecSpace>()) { + // allocate memory for begins and next to be used by the hashmap + nnz_lno_temp_work_view_t beginsC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C keys"), + valuesC_.extent(0)); + nnz_lno_temp_work_view_t nextsC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C nexts"), + valuesC_.extent(0)); + Kokkos::deep_copy(beginsC, -1); + + // create the functor. + NumericCMEM + sc(a_row_cnt, row_mapA, entriesA, valsA, + + row_mapB, entriesB, valsB, + + rowmapC_, entriesC_, valuesC_, + + beginsC, nextsC, shmem_size, suggested_vector_size, + team_row_chunk_size, suggested_team_size, KOKKOSKERNELS_VERBOSE); + + Kokkos::Timer timer1; + MyExecSpace().fence(); + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tGPU vector_size:" << suggested_vector_size + << " team_size:" << suggested_team_size + << " chunk_size:" << team_row_chunk_size << std::endl; + } + + timer1.reset(); + // this is basically kkmem without memory pools. + // only executed for to check the effect of memory pools. + Kokkos::parallel_for( + "KokkosSparse::NumericCMEM::KKSPEED::GPU", + gpu_team_policy_t(a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + MyExecSpace().fence(); + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tNumeric TIME:" << timer1.seconds() << std::endl; + } + } else { + Kokkos::Timer numeric_speed_timer; + typedef KokkosKernels::Impl::UniformMemoryPool + pool_memory_space; + + KokkosKernels::Impl::PoolType my_pool_type = + KokkosKernels::Impl::OneThread2OneChunk; + int num_chunks = concurrency; + + Kokkos::Timer timer1; + pool_memory_space m_space( + num_chunks, this->b_col_cnt + (this->b_col_cnt) / sizeof(scalar_t) + 1, + 0, my_pool_type); + MyExecSpace().fence(); + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tPool Alloc Time:" << timer1.seconds() << std::endl; + std::cout << "\tPool Size(MB):" + << sizeof(scalar_t) * + (num_chunks * + (this->b_col_cnt + + (this->b_col_cnt) / sizeof(scalar_t) + 1)) / + 1024. / 1024. + << std::endl; + } + + NumericCMEM_CPU + sc(a_row_cnt, b_col_cnt, row_mapA, entriesA, valsA, + + row_mapB, entriesB, valsB, + + rowmapC_, entriesC_, valuesC_, m_space, my_exec_space_, + team_row_chunk_size); + + MyExecSpace().fence(); + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tCPU vector_size:" << suggested_vector_size + << " team_size:" << suggested_team_size + << " chunk_size:" << team_row_chunk_size << std::endl; + } + timer1.reset(); + + if (use_dynamic_schedule) { + Kokkos::parallel_for("KokkosSparse::NumericCMEM_CPU::DENSE::DYNAMIC", + dynamic_multicore_team_policy_t( + a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + } else { + Kokkos::parallel_for( + "KokkosSparse::NumericCMEM_CPU::DENSE::STATIC", + multicore_team_policy_t(a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + } + + MyExecSpace().fence(); + + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tNumeric TIME:" << timer1.seconds() << std::endl; + std::cout << "\t\tNumeric SPEED TIME:" << numeric_speed_timer.seconds() + << std::endl; + } + } + if (KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tNumeric SPEED TIME WITH FREE:" + << numeric_speed_timer_with_free.seconds() << std::endl; + } +} +} // namespace Impl +} // namespace KokkosSparse diff --git a/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp new file mode 100644 index 0000000000..0b28d2f02b --- /dev/null +++ b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp @@ -0,0 +1,436 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSSPARSE_IMPL_SPGEMM_NUMERIC_SPEC_HPP_ +#define KOKKOSSPARSE_IMPL_SPGEMM_NUMERIC_SPEC_HPP_ + +#include + +#include +//#include +#include "KokkosKernels_Handle.hpp" +// Include the actual functors +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +//#include "KokkosSparse_spgemm_symbolic.hpp" +#include "KokkosSparse_spgemm_cuSPARSE_impl.hpp" +#include "KokkosSparse_spgemm_CUSP_impl.hpp" +#include "KokkosSparse_spgemm_impl.hpp" +#include "KokkosSparse_spgemm_impl_seq.hpp" +#include "KokkosSparse_spgemm_mkl_impl.hpp" +#include "KokkosSparse_spgemm_mkl2phase_impl.hpp" +#include "KokkosSparse_spgemm_viennaCL_impl.hpp" +#endif + +namespace KokkosSparse { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct spgemm_numeric_eti_spec_avail { + enum : bool { value = false }; +}; + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ + template <> \ + struct spgemm_numeric_eti_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; \ + \ + template <> \ + struct spgemm_numeric_eti_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosSparse { +namespace Impl { + +// Unification layer +/// \brief Implementation of KokkosBlas::spgemm (sparse matrix - dense +/// vector multiply) for multiple vectors at a time (multivectors) +/// and possibly multiple coefficients at a time. + +template ::value, + bool eti_spec_avail = spgemm_numeric_eti_spec_avail< + KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t, + b_size_view_t_, b_lno_view_t, b_scalar_view_t, c_size_view_t_, + c_lno_view_t, c_scalar_view_t>::value> +struct SPGEMM_NUMERIC { + static void spgemm_numeric(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, + typename KernelHandle::const_nnz_lno_t n, + typename KernelHandle::const_nnz_lno_t k, + a_size_view_t_ row_mapA, a_lno_view_t entriesA, + a_scalar_view_t valuesA, + + bool transposeA, b_size_view_t_ row_mapB, + b_lno_view_t entriesB, b_scalar_view_t valuesB, + bool transposeB, c_size_view_t_ row_mapC, + c_lno_view_t &entriesC, c_scalar_view_t &valuesC); +}; + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + +//! Full specialization of spgemm_mv for single vectors (2-D Views). +// Unification layer +template +struct SPGEMM_NUMERIC< + KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t, b_size_view_t_, + b_lno_view_t, b_scalar_view_t, c_size_view_t_, c_lno_view_t, + c_scalar_view_t, false, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> { + static void spgemm_numeric( + KernelHandle *handle, typename KernelHandle::nnz_lno_t m, + typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, + a_size_view_t_ row_mapA, a_lno_view_t entriesA, a_scalar_view_t valuesA, + + bool transposeA, b_size_view_t_ row_mapB, b_lno_view_t entriesB, + b_scalar_view_t valuesB, bool transposeB, c_size_view_t_ row_mapC, + c_lno_view_t &entriesC, c_scalar_view_t &valuesC) { + typedef typename KernelHandle::SPGEMMHandleType spgemmHandleType; + spgemmHandleType *sh = handle->get_spgemm_handle(); + if (!sh->is_symbolic_called()) { + throw std::runtime_error( + "Call spgemm symbolic before calling SpGEMM numeric"); + /* + KokkosSparse::Experimental::spgemm_symbolic( + handle, m, n, k, + row_mapA, entriesA, transposeA, + row_mapB, entriesB, transposeB, + row_mapC + ); + typename c_size_view_t_::value_type c_nnz_size = + handle->get_spgemm_handle()->get_c_nnz(); if (c_nnz_size){ entriesC = + c_lno_view_t (Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), + c_nnz_size); valuesC = c_scalar_view_t + (Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size); + } + */ + } + + switch (sh->get_algorithm_type()) { + case SPGEMM_CUSPARSE: + cuSPARSE_apply( + sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB, + entriesB, valuesB, transposeB, row_mapC, entriesC, valuesC); + break; + case SPGEMM_CUSP: + CUSP_apply(sh, m, n, k, row_mapA, entriesA, valuesA, + transposeA, row_mapB, entriesB, valuesB, + transposeB, row_mapC, entriesC, valuesC); + break; + case SPGEMM_MKL: +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + mkl_numeric(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, + row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC, + valuesC, handle->get_verbose()); +#else + throw std::runtime_error("MKL was not enabled in this build!"); +#endif + break; + case SPGEMM_MKL2PHASE: + mkl2phase_apply(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, + row_mapB, entriesB, valuesB, transposeB, row_mapC, + entriesC, valuesC, handle->get_verbose()); + break; + + case SPGEMM_VIENNA: + viennaCL_apply( + sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB, + entriesB, valuesB, transposeB, row_mapC, entriesC, valuesC, + handle->get_verbose()); + break; + + default: + + { + KokkosSPGEMM + kspgemm(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA, + row_mapB, entriesB, valuesB, transposeB); + kspgemm.KokkosSPGEMM_numeric(row_mapC, entriesC, valuesC); + } break; + case SPGEMM_SERIAL: + case SPGEMM_DEBUG: + spgemm_debug_numeric(handle, m, n, k, row_mapA, entriesA, valuesA, + + transposeA, row_mapB, entriesB, valuesB, + transposeB, row_mapC, entriesC, valuesC); + break; + } + } +}; + +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ + extern template struct SPGEMM_NUMERIC< \ + typename KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; \ + \ + extern template struct SPGEMM_NUMERIC< \ + typename KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ + template struct SPGEMM_NUMERIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; \ + \ + template struct SPGEMM_NUMERIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#include +#include + +#endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ diff --git a/unit_test/sparse/Test_Sparse_bspgemm.hpp b/unit_test/sparse/Test_Sparse_bspgemm.hpp new file mode 100644 index 0000000000..47b06b716a --- /dev/null +++ b/unit_test/sparse/Test_Sparse_bspgemm.hpp @@ -0,0 +1,459 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosKernels_Sorting.hpp" +#include +#include +#include + +#include "KokkosSparse_spgemm.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +#include +#include + +#include + +// This file contains the matrix for test_issue402 +#include "matrixIssue402.hpp" + +// const char *input_filename = "sherman1.mtx"; +// const char *input_filename = "Si2.mtx"; +// const char *input_filename = "wathen_30_30.mtx"; +// const size_t expected_num_cols = 9906; +using namespace KokkosSparse; +using namespace KokkosSparse::Experimental; +using namespace KokkosKernels; +using namespace KokkosKernels::Experimental; + +// #ifndef kokkos_complex_double +// #define kokkos_complex_double Kokkos::complex +// #define kokkos_complex_float Kokkos::complex +// #endif + +typedef Kokkos::complex kokkos_complex_double; +typedef Kokkos::complex kokkos_complex_float; + +namespace Test { + +template +int run_spgemm(crsMat_t A, crsMat_t B, + KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, crsMat_t &C) { + typedef typename crsMat_t::size_type size_type; + typedef typename crsMat_t::ordinal_type lno_t; + typedef typename crsMat_t::value_type scalar_t; + + typedef KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, typename device::execution_space, + typename device::memory_space, typename device::memory_space> + KernelHandle; + + KernelHandle kh; + kh.set_team_work_size(16); + kh.set_dynamic_scheduling(true); + + kh.create_spgemm_handle(spgemm_algorithm); + + KokkosSparse::spgemm_symbolic(kh, A, false, B, false, C); + KokkosSparse::spgemm_numeric(kh, A, false, B, false, C); + kh.destroy_spgemm_handle(); + + return 0; +} + +template +int run_spgemm_old_interface(crsMat_t input_mat, crsMat_t input_mat2, + KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, + crsMat_t &result) { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type lno_view_t; + typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + + typedef typename lno_view_t::value_type size_type; + typedef typename lno_nnz_view_t::value_type lno_t; + typedef typename scalar_view_t::value_type scalar_t; + + typedef KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, typename device::execution_space, + typename device::memory_space, typename device::memory_space> + KernelHandle; + + KernelHandle kh; + kh.set_team_work_size(16); + kh.set_dynamic_scheduling(true); + // kh.set_verbose(true); + + kh.create_spgemm_handle(spgemm_algorithm); + + const size_t num_rows_1 = input_mat.numRows(); + const size_t num_rows_2 = input_mat2.numRows(); + const size_t num_cols_2 = input_mat2.numCols(); + + const size_t num_cols_1 = input_mat.numCols(); + bool equal = num_rows_2 == num_cols_1; + if (!equal) return 1; + + lno_view_t row_mapC("non_const_lnow_row", num_rows_1 + 1); + lno_nnz_view_t entriesC; + scalar_view_t valuesC; + + spgemm_symbolic(&kh, num_rows_1, num_rows_2, num_cols_2, + input_mat.graph.row_map, input_mat.graph.entries, false, + input_mat2.graph.row_map, input_mat2.graph.entries, false, + row_mapC); + + size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz(); + entriesC = lno_nnz_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), c_nnz_size); + valuesC = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size); + spgemm_numeric(&kh, num_rows_1, num_rows_2, num_cols_2, + input_mat.graph.row_map, input_mat.graph.entries, + input_mat.values, false, + + input_mat2.graph.row_map, input_mat2.graph.entries, + input_mat2.values, false, row_mapC, entriesC, valuesC); + + graph_t static_graph(entriesC, row_mapC); + result = crsMat_t("CrsMatrix", num_cols_2, valuesC, static_graph); + kh.destroy_spgemm_handle(); + + return 0; +} +template +bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type lno_view_t; + typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + + size_t nrows_actual = output_mat_actual.numRows(); + size_t nentries_actual = output_mat_actual.graph.entries.extent(0); + size_t nvals_actual = output_mat_actual.values.extent(0); + + size_t nrows_reference = output_mat_reference.numRows(); + size_t nentries_reference = output_mat_reference.graph.entries.extent(0); + size_t nvals_reference = output_mat_reference.values.extent(0); + + if (nrows_actual != nrows_reference) { + std::cout << "nrows_actual:" << nrows_actual + << " nrows_reference:" << nrows_reference << std::endl; + return false; + } + if (nentries_actual != nentries_reference) { + std::cout << "nentries_actual:" << nentries_actual + << " nentries_reference:" << nentries_reference << std::endl; + return false; + } + if (nvals_actual != nvals_reference) { + std::cout << "nvals_actual:" << nvals_actual + << " nvals_reference:" << nvals_reference << std::endl; + return false; + } + + KokkosKernels::sort_crs_matrix(output_mat_actual); + KokkosKernels::sort_crs_matrix(output_mat_reference); + + bool is_identical = true; + is_identical = KokkosKernels::Impl::kk_is_identical_view< + typename graph_t::row_map_type, typename graph_t::row_map_type, + typename lno_view_t::value_type, typename device::execution_space>( + output_mat_actual.graph.row_map, output_mat_reference.graph.row_map, 0); + + if (!is_identical) { + std::cout << "rowmaps are different." << std::endl; + std::cout << "Actual rowmap:\n"; + KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.graph.row_map); + std::cout << "Correct rowmap (SPGEMM_DEBUG):\n"; + KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.graph.row_map); + return false; + } + + is_identical = KokkosKernels::Impl::kk_is_identical_view< + lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type, + typename device::execution_space>(output_mat_actual.graph.entries, + output_mat_reference.graph.entries, 0); + + if (!is_identical) { + std::cout << "entries are different." << std::endl; + KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.graph.entries); + KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.graph.entries); + return false; + } + + typedef typename Kokkos::Details::ArithTraits< + typename scalar_view_t::non_const_value_type>::mag_type eps_type; + eps_type eps = std::is_same::value ? 3.7e-3 : 1e-7; + + is_identical = KokkosKernels::Impl::kk_is_relatively_identical_view< + scalar_view_t, scalar_view_t, eps_type, typename device::execution_space>( + output_mat_actual.values, output_mat_reference.values, eps); + + if (!is_identical) { + std::cout << "values are different." << std::endl; + KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.values); + KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.values); + + return false; + } + return true; +} +} // namespace Test + +// Generate matrices and test all supported spgemm algorithms. +// C := AB, where A is m*k, B is k*n, and C is m*n. +template +void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, + lno_t row_size_variance, bool oldInterface = false) { + using namespace Test; + // device::execution_space::initialize(); + // device::execution_space::print_configuration(std::cout); + + typedef CrsMatrix crsMat_t; + // typedef typename crsMat_t::StaticCrsGraphType graph_t; + // typedef typename graph_t::row_map_type::non_const_type lno_view_t; + // typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; + // typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + + // Generate random compressed sparse row matrix. Randomly generated (non-zero) + // values are stored in a 1-D (1 rank) array. + crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix( + m, k, nnz, row_size_variance, bandwidth); + crsMat_t B = KokkosKernels::Impl::kk_generate_sparse_matrix( + k, n, nnz, row_size_variance, bandwidth); + + const bool is_empy_case = m < 1 || n < 1 || k < 1 || nnz < 1; + + crsMat_t output_mat2; + if (oldInterface) + run_spgemm_old_interface(A, B, SPGEMM_DEBUG, output_mat2); + else + run_spgemm(A, B, SPGEMM_DEBUG, output_mat2); + + std::vector algorithms = { + SPGEMM_KK, SPGEMM_KK_LP, SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */, + SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */ + }; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + algorithms.push_back(SPGEMM_MKL); +#endif + + for (auto spgemm_algorithm : algorithms) { + const uint64_t max_integer = 2147483647; + std::string algo = "UNKNOWN"; + bool is_expected_to_fail = false; + + switch (spgemm_algorithm) { + case SPGEMM_CUSPARSE: + // TODO: add these test failure cases for cusparse too. + algo = "SPGEMM_CUSPARSE"; +#if !defined(KERNELS_HAVE_CUSPARSE) && \ + !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + is_expected_to_fail = true; +#endif + break; + + case SPGEMM_MKL: algo = "SPGEMM_MKL"; +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if (!KokkosSparse::Impl::mkl_is_supported_value_type::value) { + is_expected_to_fail = true; + } +#endif + // MKL requires local ordinals to be int. + // Note: empty-array special case will NOT fail on this. + if (!std::is_same::value && !is_empy_case) { + is_expected_to_fail = true; + } + // if size_type is larger than int, mkl casts it to int. + // it will fail if casting cause overflow. + if (A.values.extent(0) > max_integer) { + is_expected_to_fail = true; + } + break; + + case SPGEMM_KK: algo = "SPGEMM_KK"; break; + case SPGEMM_KK_LP: algo = "SPGEMM_KK_LP"; break; + case SPGEMM_KK_MEMSPEED: algo = "SPGEMM_KK_MEMSPEED"; break; + case SPGEMM_KK_SPEED: algo = "SPGEMM_KK_SPEED"; break; + case SPGEMM_KK_MEMORY: algo = "SPGEMM_KK_MEMORY"; break; + default: algo = "!!! UNKNOWN ALGO !!!"; + } + + Kokkos::Timer timer1; + crsMat_t output_mat; + + bool failed = false; + int res = 0; + try { + if (oldInterface) + res = run_spgemm_old_interface(A, B, spgemm_algorithm, + output_mat); + else + res = run_spgemm(A, B, spgemm_algorithm, output_mat); + } catch (const char *message) { + EXPECT_TRUE(is_expected_to_fail) << algo << ": " << message; + failed = true; + } catch (std::string message) { + EXPECT_TRUE(is_expected_to_fail) << algo << ": " << message; + failed = true; + } catch (std::exception &e) { + EXPECT_TRUE(is_expected_to_fail) << algo << ": " << e.what(); + failed = true; + } + EXPECT_EQ(is_expected_to_fail, failed); + + // double spgemm_time = timer1.seconds(); + + timer1.reset(); + if (!is_expected_to_fail) { + EXPECT_TRUE((res == 0)) << algo; + bool is_identical = + is_same_matrix(output_mat, output_mat2); + EXPECT_TRUE(is_identical) << algo; + // EXPECT_TRUE( equal) << algo; + } + // std::cout << "algo:" << algo << " spgemm_time:" << spgemm_time << " + // output_check_time:" << timer1.seconds() << std::endl; + } + // device::execution_space::finalize(); +} + +template +void test_issue402() { + using namespace Test; + typedef CrsMatrix crsMat_t; + + // this specific matrix (from a circuit simulation) reliably replicated issue + // #402 (incorrect/crashing SPGEMM KKMEM) + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type lno_view_t; + typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + const lno_t numRows = 1813; + const size_type nnz = 11156; + lno_view_t Arowmap("A rowmap", numRows + 1); + lno_nnz_view_t Aentries("A entries", nnz); + scalar_view_t Avalues("A values", nnz); + // Read out the matrix from the header file "matrixIssue402.hpp" + { + auto rowmapHost = Kokkos::create_mirror_view(Arowmap); + auto entriesHost = Kokkos::create_mirror_view(Aentries); + auto valuesHost = Kokkos::create_mirror_view(Avalues); + for (lno_t i = 0; i < numRows + 1; i++) + rowmapHost(i) = MatrixIssue402::rowmap[i]; + for (size_type i = 0; i < nnz; i++) { + entriesHost(i) = MatrixIssue402::entries[i]; + valuesHost(i) = MatrixIssue402::values[i]; + } + Kokkos::deep_copy(Arowmap, rowmapHost); + Kokkos::deep_copy(Aentries, entriesHost); + Kokkos::deep_copy(Avalues, valuesHost); + } + crsMat_t A("A", numRows, numRows, nnz, Avalues, Arowmap, Aentries); + // compute explicit transpose: the bug was replicated by computing AA' + lno_view_t Browmap("B = A^T rowmap", numRows + 1); + lno_nnz_view_t Bentries("B = A^T entries", nnz); + scalar_view_t Bvalues("B = A^T values", nnz); + KokkosKernels::Impl::transpose_matrix< + lno_view_t, lno_nnz_view_t, scalar_view_t, lno_view_t, lno_nnz_view_t, + scalar_view_t, lno_view_t, typename device::execution_space>( + numRows, numRows, Arowmap, Aentries, Avalues, Browmap, Bentries, Bvalues); + crsMat_t B("B=A^T", numRows, numRows, nnz, Bvalues, Browmap, Bentries); + crsMat_t Cgold; + run_spgemm(A, B, SPGEMM_DEBUG, Cgold); + crsMat_t C; + bool success = true; + std::string errMsg; + try { + int res = run_spgemm(A, B, SPGEMM_KK_MEMORY, C); + if (res) throw "run_spgemm returned error code"; + } catch (const char *message) { + errMsg = message; + success = false; + } catch (std::string message) { + errMsg = message; + success = false; + } catch (std::exception &e) { + errMsg = e.what(); + success = false; + } + EXPECT_TRUE(success) << "KKMEM still has issue 402 bug! Error message:\n" + << errMsg << '\n'; + bool correctResult = is_same_matrix(C, Cgold); + EXPECT_TRUE(correctResult) + << "KKMEM still has issue 402 bug; C=AA' is incorrect!\n"; +} + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + sparse##_##spgemm##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_spgemm(10000, 10000, 10000, \ + 10000 * 20, 500, 10, false); \ + test_spgemm(10000, 10000, 10000, \ + 10000 * 20, 500, 10, true); \ + test_spgemm(0, 0, 0, 0, 10, 10, false); \ + test_spgemm(0, 0, 0, 0, 10, 10, true); \ + test_spgemm(0, 12, 5, 0, 10, 0, false); \ + test_spgemm(0, 12, 5, 0, 10, 0, true); \ + test_spgemm(10, 10, 0, 0, 10, 10, false); \ + test_spgemm(10, 10, 0, 0, 10, 10, true); \ + test_spgemm(10, 10, 10, 0, 0, 0, false); \ + test_spgemm(10, 10, 10, 0, 0, 0, true); \ + test_issue402(); \ + } + +// test_spgemm(50000, 50000 * 30, 100, 10); +// test_spgemm(50000, 50000 * 30, 200, 10); + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST From 58959c70cefd4549bf99631a2d4a91677b9d2ae9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Wed, 19 Jan 2022 14:40:33 +0100 Subject: [PATCH 092/261] Unit test for block SpGEMM --- src/common/KokkosKernels_IOUtils.hpp | 26 +++- src/common/KokkosKernels_Sorting.hpp | 71 ++++++++++ unit_test/sparse/Test_Sparse.hpp | 1 + unit_test/sparse/Test_Sparse_bspgemm.hpp | 172 +++++++++++------------ 4 files changed, 176 insertions(+), 94 deletions(-) diff --git a/src/common/KokkosKernels_IOUtils.hpp b/src/common/KokkosKernels_IOUtils.hpp index b0575197b0..d450221797 100644 --- a/src/common/KokkosKernels_IOUtils.hpp +++ b/src/common/KokkosKernels_IOUtils.hpp @@ -59,6 +59,7 @@ #include #include "Kokkos_Random.hpp" #include "KokkosKernels_SimpleUtils.hpp" +#include "KokkosSparse_CrsMatrix.hpp" #include namespace KokkosKernels { @@ -94,7 +95,8 @@ template void kk_sparseMatrix_generate(OrdinalType nrows, OrdinalType ncols, SizeType &nnz, OrdinalType row_size_variance, OrdinalType bandwidth, ScalarType *&values, - SizeType *&rowPtr, OrdinalType *&colInd) { + SizeType *&rowPtr, OrdinalType *&colInd, + OrdinalType block_elem_count = 1) { rowPtr = new SizeType[nrows + 1]; OrdinalType elements_per_row = nrows ? nnz / nrows : 0; @@ -138,7 +140,8 @@ void kk_sparseMatrix_generate(OrdinalType nrows, OrdinalType ncols, } // Sample each value from uniform (-50, 50) for real types, or (-50 - 50i, 50 // + 50i) for complex types. - Kokkos::View valuesView(values, nnz); + Kokkos::View valuesView( + values, nnz * block_elem_count); ScalarType randStart, randEnd; getRandomBounds(50.0, randStart, randEnd); Kokkos::Random_XorShift64_Pool pool(13718); @@ -443,6 +446,25 @@ crsMat_t kk_generate_sparse_matrix( return crsmat; } +template +bsrMat_t kk_generate_sparse_matrix( + typename bsrMat_t::const_ordinal_type block_dim, + typename bsrMat_t::const_ordinal_type nrows, + typename bsrMat_t::const_ordinal_type ncols, + typename bsrMat_t::non_const_size_type &nnz, + typename bsrMat_t::const_ordinal_type row_size_variance, + typename bsrMat_t::const_ordinal_type bandwidth) { + typedef KokkosSparse::CrsMatrix< + typename bsrMat_t::value_type, typename bsrMat_t::ordinal_type, + typename bsrMat_t::device_type, typename bsrMat_t::memory_traits, + typename bsrMat_t::size_type> + crsMat_t; + + const auto crs_mtx = kk_generate_sparse_matrix( + nrows * block_dim, ncols * block_dim, nnz, row_size_variance, bandwidth); + bsrMat_t bsrmat(crs_mtx, block_dim); + return bsrmat; +} // TODO: need to fix the size_type. All over the reading inputs are lno_t. template diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp index 1cdf1df7ee..845a162e51 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -61,6 +61,13 @@ struct DefaultComparator { }; } // namespace Impl +// ---------------------------------- +// BSR matrix/graph sorting utilities +// ---------------------------------- + +template +void sort_bsr_matrix(const bsrMat_t& A); + // ---------------------------------- // CRS matrix/graph sorting utilities // ---------------------------------- @@ -565,6 +572,70 @@ void sort_crs_matrix(const crsMat_t& A) { A.graph.row_map, A.graph.entries, A.values); } +namespace Impl { + +template +KOKKOS_INLINE_FUNCTION void kk_swap(T& a, T& b) { + T t = a; + a = b; + b = t; +} + +} // namespace Impl + +// Sort a BRS matrix: within each row, sort entries ascending by column and +// permute the values accordingly. +template +void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, + const entries_t& entries, const values_t& values) { + // TODO: this is O(N^2) mock for debugging - do regular implementation based + // on Radix/Bitonic sort (like CSR) IDEA: maybe we need only one general + // Radix2/Bitonic2 and CSR sorting may call it with blockSize=1 ? + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if (numRows == 0) return; + const lno_t blocksize = blockdim * blockdim; + + assert(values.extent(0) == entries.extent(0) * blocksize); + Kokkos::parallel_for( + "sort_bsr_matrix", Kokkos::RangePolicy(0, numRows), + KOKKOS_LAMBDA(lno_t i) { + const lno_t rowStart = rowmap(i); + const lno_t rowSize = rowmap(i + 1) - rowStart; + auto* e = entries.data() + rowStart; + auto* v = values.data() + rowStart * blocksize; + bool done = false; + while (!done) { + done = true; + for (lno_t j = 1; j < rowSize; ++j) { + const lno_t jp = j - 1; + if (e[jp] <= e[j]) continue; + Impl::kk_swap(e[jp], e[j]); + auto const vb = v + j * blocksize; + auto const vbp = v + jp * blocksize; + for (lno_t k = 0; k < blocksize; + ++k) // std::swap_ranges(vb, vb + blocksize, vbp); + Impl::kk_swap(vb[k], vbp[k]); + done = false; + } + } + }); +} + +// Sort a BSR matrix (like CRS but single values are replaced with contignous +// blocks) +template +void sort_bsr_matrix(const bsrMat_t& A) { + // NOTE: unlike rowmap, entries and values are non-const, so we can sort them + // directly + sort_bsr_matrix( + A.blockDim(), A.graph.row_map, A.graph.entries, A.values); +} + // Sort a CRS graph: within each row, sort entries ascending by column. template void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { diff --git a/unit_test/sparse/Test_Sparse.hpp b/unit_test/sparse/Test_Sparse.hpp index 684b6855f2..65cbb40ca5 100644 --- a/unit_test/sparse/Test_Sparse.hpp +++ b/unit_test/sparse/Test_Sparse.hpp @@ -12,6 +12,7 @@ #include "Test_Sparse_spadd.hpp" #include "Test_Sparse_spgemm_jacobi.hpp" #include "Test_Sparse_spgemm.hpp" +#include "Test_Sparse_bspgemm.hpp" #include "Test_Sparse_spiluk.hpp" #include "Test_Sparse_spmv.hpp" #include "Test_Sparse_spmv_blockcrs.hpp" diff --git a/unit_test/sparse/Test_Sparse_bspgemm.hpp b/unit_test/sparse/Test_Sparse_bspgemm.hpp index 47b06b716a..4463eba503 100644 --- a/unit_test/sparse/Test_Sparse_bspgemm.hpp +++ b/unit_test/sparse/Test_Sparse_bspgemm.hpp @@ -47,65 +47,47 @@ #include "KokkosKernels_SparseUtils.hpp" #include "KokkosKernels_Sorting.hpp" -#include -#include -#include - #include "KokkosSparse_spgemm.hpp" -#include "KokkosSparse_CrsMatrix.hpp" - -#include -#include - -#include - -// This file contains the matrix for test_issue402 -#include "matrixIssue402.hpp" +#include "KokkosSparse_BsrMatrix.hpp" -// const char *input_filename = "sherman1.mtx"; -// const char *input_filename = "Si2.mtx"; -// const char *input_filename = "wathen_30_30.mtx"; -// const size_t expected_num_cols = 9906; using namespace KokkosSparse; -using namespace KokkosSparse::Experimental; -using namespace KokkosKernels; -using namespace KokkosKernels::Experimental; - -// #ifndef kokkos_complex_double -// #define kokkos_complex_double Kokkos::complex -// #define kokkos_complex_float Kokkos::complex -// #endif - -typedef Kokkos::complex kokkos_complex_double; -typedef Kokkos::complex kokkos_complex_float; namespace Test { -template -int run_spgemm(crsMat_t A, crsMat_t B, - KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, crsMat_t &C) { - typedef typename crsMat_t::size_type size_type; - typedef typename crsMat_t::ordinal_type lno_t; - typedef typename crsMat_t::value_type scalar_t; +template +int run_block_spgemm(const bsrMat_t A, const bsrMat_t B, bsrMat_t &C, + // parameters + KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, + bool use_dynamic_scheduling = true, + size_t shmem_size = 0) { + typedef typename bsrMat_t::size_type size_type; + typedef typename bsrMat_t::ordinal_type lno_t; + typedef typename bsrMat_t::value_type scalar_t; + typedef typename bsrMat_t::device_type device; + typedef typename bsrMat_t::memory_space memory_space; typedef KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space> + memory_space, memory_space> KernelHandle; KernelHandle kh; kh.set_team_work_size(16); - kh.set_dynamic_scheduling(true); + kh.set_dynamic_scheduling(use_dynamic_scheduling); kh.create_spgemm_handle(spgemm_algorithm); - KokkosSparse::spgemm_symbolic(kh, A, false, B, false, C); - KokkosSparse::spgemm_numeric(kh, A, false, B, false, C); + if (shmem_size > 0) { + kh.set_shmem_size(shmem_size); + } + KokkosSparse::block_spgemm_symbolic(kh, A, false, B, false, C); + KokkosSparse::block_spgemm_numeric(kh, A, false, B, false, C); kh.destroy_spgemm_handle(); return 0; } +#if 0 // not used in block SPGEMM template int run_spgemm_old_interface(crsMat_t input_mat, crsMat_t input_mat2, KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, @@ -166,12 +148,16 @@ int run_spgemm_old_interface(crsMat_t input_mat, crsMat_t input_mat2, return 0; } -template -bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type lno_view_t; - typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; +#endif + +template +bool is_same_block_matrix(bsrMat_t output_mat_actual, + bsrMat_t output_mat_reference) { + using device = typename bsrMat_t::device_type; + using graph_t = typename bsrMat_t::StaticCrsGraphType; + using lno_view_t = typename graph_t::row_map_type::non_const_type; + using lno_nnz_view_t = typename graph_t::entries_type::non_const_type; + using scalar_view_t = typename bsrMat_t::values_type::non_const_type; size_t nrows_actual = output_mat_actual.numRows(); size_t nentries_actual = output_mat_actual.graph.entries.extent(0); @@ -197,8 +183,8 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { return false; } - KokkosKernels::sort_crs_matrix(output_mat_actual); - KokkosKernels::sort_crs_matrix(output_mat_reference); + KokkosKernels::sort_bsr_matrix(output_mat_actual); + KokkosKernels::sort_bsr_matrix(output_mat_reference); bool is_identical = true; is_identical = KokkosKernels::Impl::kk_is_identical_view< @@ -250,38 +236,43 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { // C := AB, where A is m*k, B is k*n, and C is m*n. template -void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, - lno_t row_size_variance, bool oldInterface = false) { +void test_bspgemm(lno_t blockDim, lno_t m, lno_t k, lno_t n, size_type nnz, + lno_t bandwidth, lno_t row_size_variance, + const bool use_dynamic_scheduling = true, + const size_t shared_memory_size = 0) { using namespace Test; // device::execution_space::initialize(); // device::execution_space::print_configuration(std::cout); - typedef CrsMatrix crsMat_t; - // typedef typename crsMat_t::StaticCrsGraphType graph_t; - // typedef typename graph_t::row_map_type::non_const_type lno_view_t; - // typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - // typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + using bsrMat_t = + KokkosSparse::Experimental::BsrMatrix; // Generate random compressed sparse row matrix. Randomly generated (non-zero) // values are stored in a 1-D (1 rank) array. - crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix( - m, k, nnz, row_size_variance, bandwidth); - crsMat_t B = KokkosKernels::Impl::kk_generate_sparse_matrix( - k, n, nnz, row_size_variance, bandwidth); + bsrMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix( + blockDim, m, k, nnz, row_size_variance, bandwidth); + bsrMat_t B = KokkosKernels::Impl::kk_generate_sparse_matrix( + blockDim, k, n, nnz, row_size_variance, bandwidth); const bool is_empy_case = m < 1 || n < 1 || k < 1 || nnz < 1; - crsMat_t output_mat2; - if (oldInterface) - run_spgemm_old_interface(A, B, SPGEMM_DEBUG, output_mat2); - else - run_spgemm(A, B, SPGEMM_DEBUG, output_mat2); + bsrMat_t output_mat2; + run_block_spgemm(A, B, output_mat2, SPGEMM_DEBUG, use_dynamic_scheduling, + shared_memory_size); std::vector algorithms = { - SPGEMM_KK, SPGEMM_KK_LP, SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */, + SPGEMM_KK, SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */, SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */ }; + if (!KokkosKernels::Impl::kk_is_gpu_exec_space< + typename device::execution_space>()) { + // SPGEMM_KK_LP is useful on CPU to cover MultiCoreTag4 functor + // (otherwise skipped) but on GPU it's same as SPGEMM_KK, so we can skip it. + algorithms.push_back(SPGEMM_KK_LP); + } + #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL algorithms.push_back(SPGEMM_MKL); #endif @@ -295,8 +286,7 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, case SPGEMM_CUSPARSE: // TODO: add these test failure cases for cusparse too. algo = "SPGEMM_CUSPARSE"; -#if !defined(KERNELS_HAVE_CUSPARSE) && \ - !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) +#ifndef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE is_expected_to_fail = true; #endif break; @@ -328,16 +318,13 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, } Kokkos::Timer timer1; - crsMat_t output_mat; + bsrMat_t output_mat; bool failed = false; int res = 0; try { - if (oldInterface) - res = run_spgemm_old_interface(A, B, spgemm_algorithm, - output_mat); - else - res = run_spgemm(A, B, spgemm_algorithm, output_mat); + res = run_block_spgemm(A, B, output_mat, spgemm_algorithm, + use_dynamic_scheduling, shared_memory_size); } catch (const char *message) { EXPECT_TRUE(is_expected_to_fail) << algo << ": " << message; failed = true; @@ -355,8 +342,7 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, timer1.reset(); if (!is_expected_to_fail) { EXPECT_TRUE((res == 0)) << algo; - bool is_identical = - is_same_matrix(output_mat, output_mat2); + bool is_identical = is_same_block_matrix(output_mat, output_mat2); EXPECT_TRUE(is_identical) << algo; // EXPECT_TRUE( equal) << algo; } @@ -366,6 +352,7 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, // device::execution_space::finalize(); } +#if 0 // TODO: specific SpGEMM case, not applicable in block version template void test_issue402() { @@ -432,28 +419,29 @@ void test_issue402() { EXPECT_TRUE(correctResult) << "KKMEM still has issue 402 bug; C=AA' is incorrect!\n"; } +#endif -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F(TestCategory, \ - sparse##_##spgemm##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_spgemm(10000, 10000, 10000, \ - 10000 * 20, 500, 10, false); \ - test_spgemm(10000, 10000, 10000, \ - 10000 * 20, 500, 10, true); \ - test_spgemm(0, 0, 0, 0, 10, 10, false); \ - test_spgemm(0, 0, 0, 0, 10, 10, true); \ - test_spgemm(0, 12, 5, 0, 10, 0, false); \ - test_spgemm(0, 12, 5, 0, 10, 0, true); \ - test_spgemm(10, 10, 0, 0, 10, 10, false); \ - test_spgemm(10, 10, 0, 0, 10, 10, true); \ - test_spgemm(10, 10, 10, 0, 0, 0, false); \ - test_spgemm(10, 10, 10, 0, 0, 0, true); \ - test_issue402(); \ +// Note: Tests with shared memory specified aim to trigger specific GPU functors +// dispatched by matrix size and the available shared memory. +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + sparse_block_spgemm_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + auto const SHMEM_AUTO = 0; \ + auto test_case = test_bspgemm; \ + /* Trigger SPGEMM_KK_MEMORY_SPREADTEAM on GPU */ \ + test_case(2, 50, 50, 50, 2000, 50, 5, true, 16 * 1024); \ + /* Trigger SPGEMM_KK -> SPGEMM_KK_MEMORY on GPU */ \ + test_case(2, 50, 50, 50, 1000, 50, 5, false, 16 * 1024); \ + /* Trigger SPGEMM_KK_MEMORY_BIGSPREADTEAM on GPU */ \ + test_case(2, 500, 500, 500, 32000, 500, 500, true, 16 * 1024); \ + /* trigger dense dispatch in hash method */ \ + test_case(2, 2, 3, 4, 2, 2, 0, true, 16 * 1024); \ + /* zero-size handling */ \ + test_case(2, 0, 0, 0, 0, 10, 10, true, SHMEM_AUTO); \ + test_case(2, 0, 12, 5, 0, 10, 0, true, SHMEM_AUTO); \ + test_case(2, 10, 10, 0, 0, 10, 10, true, SHMEM_AUTO); \ } -// test_spgemm(50000, 50000 * 30, 100, 10); -// test_spgemm(50000, 50000 * 30, 200, 10); - #include #undef KOKKOSKERNELS_EXECUTE_TEST From 40e8d851d922515ac087d4e301d31fc472483220 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Wed, 19 Jan 2022 14:40:33 +0100 Subject: [PATCH 093/261] ETI specializations for block SpGEMM --- src/CMakeLists.txt | 7 ++ ...parse_bspgemm_numeric_eti_spec_inst.cpp.in | 6 +- ...arse_bspgemm_numeric_eti_spec_avail.hpp.in | 8 +- ...parse_bspgemm_numeric_eti_spec_decl.hpp.in | 8 +- .../KokkosSparse_bspgemm_numeric_spec.hpp | 81 ++++++++++--------- 5 files changed, 63 insertions(+), 47 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a3460d1413..27f4c97aa5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -360,6 +360,13 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_numeric spgemm_numeric TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) +KOKKOSKERNELS_GENERATE_ETI(Sparse_bspgemm_numeric bspgemm_numeric + COMPONENTS sparse + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE +) + KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_jacobi spgemm_jacobi COMPONENTS sparse HEADER_LIST ETI_HEADERS diff --git a/src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in index 69f8fce032..eb5d74232e 100644 --- a/src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in +++ b/src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in @@ -45,9 +45,9 @@ #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true #include "KokkosKernels_config.h" -#include "KokkosSparse_spgemm_numeric_spec.hpp" +#include "KokkosSparse_bspgemm_numeric_spec.hpp" namespace KokkosSparse { namespace Impl { -@SPARSE_SPGEMM_NUMERIC_ETI_INST_BLOCK@ - } //IMPL +@SPARSE_BSPGEMM_NUMERIC_ETI_INST_BLOCK@ + } //IMPL } //Kokkos \ No newline at end of file diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in index c1edd15270..7159192433 100644 --- a/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in +++ b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in @@ -1,5 +1,5 @@ -#ifndef KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_AVAIL_HPP_ +#ifndef KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_AVAIL_HPP_ /* //@HEADER // ************************************************************************ @@ -45,7 +45,7 @@ namespace KokkosSparse { namespace Impl { -@SPARSE_SPGEMM_NUMERIC_ETI_AVAIL_BLOCK@ - } //IMPL +@SPARSE_BSPGEMM_NUMERIC_ETI_AVAIL_BLOCK@ + } //IMPL } //Kokkos #endif \ No newline at end of file diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in index 6b31499d52..5d63c640d6 100644 --- a/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in +++ b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in @@ -1,5 +1,5 @@ -#ifndef KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +#ifndef KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ /* //@HEADER // ************************************************************************ @@ -45,7 +45,7 @@ namespace KokkosSparse { namespace Impl { -@SPARSE_SPGEMM_NUMERIC_ETI_DECL_BLOCK@ - } //IMPL +@SPARSE_BSPGEMM_NUMERIC_ETI_DECL_BLOCK@ + } //IMPL } //Kokkos #endif \ No newline at end of file diff --git a/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp index 0b28d2f02b..701106c623 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp @@ -41,8 +41,8 @@ // ************************************************************************ //@HEADER */ -#ifndef KOKKOSSPARSE_IMPL_SPGEMM_NUMERIC_SPEC_HPP_ -#define KOKKOSSPARSE_IMPL_SPGEMM_NUMERIC_SPEC_HPP_ +#ifndef KOKKOSSPARSE_IMPL_BSPGEMM_NUMERIC_SPEC_HPP_ +#define KOKKOSSPARSE_IMPL_BSPGEMM_NUMERIC_SPEC_HPP_ #include @@ -68,18 +68,18 @@ template -struct spgemm_numeric_eti_spec_avail { +struct bspgemm_numeric_eti_spec_avail { enum : bool { value = false }; }; } // namespace Impl } // namespace KokkosSparse -#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_AVAIL( \ +#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_AVAIL( \ SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ template <> \ - struct spgemm_numeric_eti_spec_avail< \ + struct bspgemm_numeric_eti_spec_avail< \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ @@ -114,7 +114,7 @@ struct spgemm_numeric_eti_spec_avail { }; \ \ template <> \ - struct spgemm_numeric_eti_spec_avail< \ + struct bspgemm_numeric_eti_spec_avail< \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ @@ -149,59 +149,68 @@ struct spgemm_numeric_eti_spec_avail { }; // Include the actual specialization declarations -#include -#include +//#include +#include namespace KokkosSparse { namespace Impl { +// For future use (when TPL with block SpGEMM numeric phase is encountered) +template +struct bspgemm_numeric_tpl_spec_avail { + enum : bool { value = false }; +}; + // Unification layer -/// \brief Implementation of KokkosBlas::spgemm (sparse matrix - dense -/// vector multiply) for multiple vectors at a time (multivectors) -/// and possibly multiple coefficients at a time. +/// \brief Implementation of BSR sparse block matrix - matrix multiplication template ::value, - bool eti_spec_avail = spgemm_numeric_eti_spec_avail< + bool eti_spec_avail = bspgemm_numeric_eti_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t, b_size_view_t_, b_lno_view_t, b_scalar_view_t, c_size_view_t_, c_lno_view_t, c_scalar_view_t>::value> -struct SPGEMM_NUMERIC { - static void spgemm_numeric(KernelHandle *handle, - typename KernelHandle::const_nnz_lno_t m, - typename KernelHandle::const_nnz_lno_t n, - typename KernelHandle::const_nnz_lno_t k, - a_size_view_t_ row_mapA, a_lno_view_t entriesA, - a_scalar_view_t valuesA, +struct BSPGEMM_NUMERIC { + static void bspgemm_numeric(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, + typename KernelHandle::const_nnz_lno_t n, + typename KernelHandle::const_nnz_lno_t k, + typename KernelHandle::const_nnz_lno_t blockDim, + a_size_view_t_ row_mapA, a_lno_view_t entriesA, + a_scalar_view_t valuesA, - bool transposeA, b_size_view_t_ row_mapB, - b_lno_view_t entriesB, b_scalar_view_t valuesB, - bool transposeB, c_size_view_t_ row_mapC, - c_lno_view_t &entriesC, c_scalar_view_t &valuesC); + bool transposeA, b_size_view_t_ row_mapB, + b_lno_view_t entriesB, b_scalar_view_t valuesB, + bool transposeB, c_size_view_t_ row_mapC, + c_lno_view_t &entriesC, c_scalar_view_t &valuesC); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -//! Full specialization of spgemm_mv for single vectors (2-D Views). +//! Full specialization of block spgemm // Unification layer template -struct SPGEMM_NUMERIC< +struct BSPGEMM_NUMERIC< KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t, b_size_view_t_, b_lno_view_t, b_scalar_view_t, c_size_view_t_, c_lno_view_t, c_scalar_view_t, false, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> { - static void spgemm_numeric( + static void bspgemm_numeric( KernelHandle *handle, typename KernelHandle::nnz_lno_t m, typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, - a_size_view_t_ row_mapA, a_lno_view_t entriesA, a_scalar_view_t valuesA, + typename KernelHandle::const_nnz_lno_t blockDim, a_size_view_t_ row_mapA, + a_lno_view_t entriesA, a_scalar_view_t valuesA, bool transposeA, b_size_view_t_ row_mapB, b_lno_view_t entriesB, b_scalar_view_t valuesB, bool transposeB, c_size_view_t_ row_mapC, @@ -292,10 +301,10 @@ struct SPGEMM_NUMERIC< } // namespace Impl } // namespace KokkosSparse -#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_DECL( \ +#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL( \ SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ - extern template struct SPGEMM_NUMERIC< \ + extern template struct BSPGEMM_NUMERIC< \ typename KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ @@ -328,7 +337,7 @@ struct SPGEMM_NUMERIC< Kokkos::MemoryTraits >, \ false, true>; \ \ - extern template struct SPGEMM_NUMERIC< \ + extern template struct BSPGEMM_NUMERIC< \ typename KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ @@ -361,10 +370,10 @@ struct SPGEMM_NUMERIC< Kokkos::MemoryTraits >, \ false, true>; -#define KOKKOSSPARSE_SPGEMM_NUMERIC_ETI_SPEC_INST( \ +#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_INST( \ SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ - template struct SPGEMM_NUMERIC< \ + template struct BSPGEMM_NUMERIC< \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ @@ -397,7 +406,7 @@ struct SPGEMM_NUMERIC< Kokkos::MemoryTraits >, \ false, true>; \ \ - template struct SPGEMM_NUMERIC< \ + template struct BSPGEMM_NUMERIC< \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ @@ -430,7 +439,7 @@ struct SPGEMM_NUMERIC< Kokkos::MemoryTraits >, \ false, true>; -#include -#include +//#include +#include #endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ From 736be462aed97ea70b2b554bc7a03c53fea69a4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Wed, 19 Jan 2022 14:40:33 +0100 Subject: [PATCH 094/261] Implementation of serial/debug block SpGEMM --- src/sparse/KokkosSparse_spgemm.hpp | 56 +++++++++++++ src/sparse/KokkosSparse_spgemm_numeric.hpp | 28 ++++++- .../impl/KokkosSparse_bspgemm_impl_seq.hpp | 82 +++++++++++++------ .../KokkosSparse_bspgemm_numeric_spec.hpp | 9 +- 4 files changed, 144 insertions(+), 31 deletions(-) diff --git a/src/sparse/KokkosSparse_spgemm.hpp b/src/sparse/KokkosSparse_spgemm.hpp index bdf4d0da75..0cee2979a2 100644 --- a/src/sparse/KokkosSparse_spgemm.hpp +++ b/src/sparse/KokkosSparse_spgemm.hpp @@ -81,6 +81,47 @@ void spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, entriesC); } +// Symbolic phase for block SpGEMM (BSR matrices) +template +void block_spgemm_symbolic(KernelHandle& kh, const AMatrixType& A, + const bool transposeA, const BMatrixType& B, + const bool transposeB, CMatrixType& C) { + using row_map_type = typename CMatrixType::row_map_type::non_const_type; + using entries_type = typename CMatrixType::index_type::non_const_type; + using values_type = typename CMatrixType::values_type::non_const_type; + + auto blockDim = A.blockDim(); + if (blockDim != B.blockDim()) { + throw std::invalid_argument( + "Block SpGEMM must be called for matrices with the same block size"); + } + + row_map_type row_mapC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "non_const_lnow_row"), + A.numRows() + 1); + + KokkosSparse::Experimental::spgemm_symbolic( + &kh, A.numRows(), B.numRows(), B.numCols(), A.graph.row_map, + A.graph.entries, transposeA, B.graph.row_map, B.graph.entries, transposeB, + row_mapC); + + entries_type entriesC; + values_type valuesC; + const size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz(); + if (c_nnz_size) { + entriesC = entries_type( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), + c_nnz_size); + valuesC = + values_type(Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), + c_nnz_size * blockDim * blockDim); + } + + C = CMatrixType("C=AB", A.numRows(), B.numCols(), c_nnz_size, valuesC, + row_mapC, entriesC, blockDim); +} + template void spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) { @@ -94,6 +135,21 @@ void spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, B.values, Bmode, C.graph.row_map, C.graph.entries, C.values); } +template +void block_spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, + const BMatrix& B, const bool Bmode, CMatrix& C) { + auto blockDim = A.blockDim(); + if (blockDim != B.blockDim() or blockDim != C.blockDim()) { + throw std::invalid_argument( + "Block SpGEMM must be called for matrices with the same block size"); + } + + KokkosSparse::Experimental::spgemm_numeric( + &kh, A.numRows(), B.numRows(), B.numCols(), A.graph.row_map, + A.graph.entries, A.values, Amode, B.graph.row_map, B.graph.entries, + B.values, Bmode, C.graph.row_map, C.graph.entries, C.values, blockDim); +} + } // namespace KokkosSparse #endif diff --git a/src/sparse/KokkosSparse_spgemm_numeric.hpp b/src/sparse/KokkosSparse_spgemm_numeric.hpp index 5bc791397c..313922dc62 100644 --- a/src/sparse/KokkosSparse_spgemm_numeric.hpp +++ b/src/sparse/KokkosSparse_spgemm_numeric.hpp @@ -46,11 +46,18 @@ #include "KokkosKernels_helpers.hpp" #include "KokkosSparse_spgemm_numeric_spec.hpp" +#include "KokkosSparse_bspgemm_numeric_spec.hpp" namespace KokkosSparse { namespace Experimental { +// +// NOTE: block_dim = 1 for CRS-formated views +// block_dim >= 1 for BSR-formatted views (bs=1 BSR is CRS) +// +// NOTE: Block CRS format is not yet supported ! +// template ::value, @@ -242,6 +251,23 @@ void spgemm_numeric(KernelHandle *handle, Internal_clno_nnz_view_t_ nonconst_c_l(entriesC.data(), entriesC.extent(0)); Internal_cscalar_nnz_view_t_ nonconst_c_s(valuesC.data(), valuesC.extent(0)); + if (block_dim > 1) { + KokkosSparse::Impl::BSPGEMM_NUMERIC< + const_handle_type, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, + Internal_ascalar_nnz_view_t_, Internal_blno_row_view_t_, + Internal_blno_nnz_view_t_, Internal_bscalar_nnz_view_t_, + Internal_clno_row_view_t_, Internal_clno_nnz_view_t_, + Internal_cscalar_nnz_view_t_>::bspgemm_numeric(&tmp_handle, m, n, k, + block_dim, const_a_r, + const_a_l, const_a_s, + transposeA, const_b_r, + const_b_l, const_b_s, + transposeB, nonconst_c_r, + nonconst_c_l, + nonconst_c_s); + return; + } + KokkosSparse::Impl::SPGEMM_NUMERIC< const_handle_type, // KernelHandle, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp index ce3501c447..7862268082 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp @@ -41,13 +41,29 @@ // ************************************************************************ //@HEADER */ -#ifndef KOKKOSSPARSE_SPGEMM_DEBUG_HPP_ -#define KOKKOSSPARSE_SPGEMM_DEBUG_HPP_ +#ifndef KOKKOSSPARSE_BSPGEMM_DEBUG_HPP_ +#define KOKKOSSPARSE_BSPGEMM_DEBUG_HPP_ #include "KokkosKernels_helpers.hpp" +#include "KokkosBatched_Gemm_Serial_Internal.hpp" +#include + namespace KokkosSparse { namespace Impl { +template +using kk_subview1d = + decltype(Kokkos::subview(data_view_t(), Kokkos::make_pair(0, 0))); + +// Returns subview +template +KOKKOS_INLINE_FUNCTION kk_subview1d get_block( + data_view_t data, size_type block_index, lno_t block_size) { + const auto i = block_index * block_size; + return Kokkos::subview(data, Kokkos::make_pair(i, i + block_size)); +} + +#if 0 // not used in block version template @@ -129,24 +145,26 @@ void spgemm_debug_symbolic(KernelHandle *handle, Kokkos::deep_copy(row_mapC, h_rmc); Kokkos::fence(); } +#endif template -void spgemm_debug_numeric(KernelHandle * /* handle */, - typename KernelHandle::nnz_lno_t m, - typename KernelHandle::nnz_lno_t /* n */, - typename KernelHandle::nnz_lno_t k, - alno_row_view_t_ row_mapA, alno_nnz_view_t_ entriesA, - ascalar_nnz_view_t_ valuesA, - - bool /* transposeA */, blno_row_view_t_ row_mapB, - blno_nnz_view_t_ entriesB, - bscalar_nnz_view_t_ valuesB, bool /* transposeB */, - clno_row_view_t_ row_mapC, clno_nnz_view_t_ entriesC, - cscalar_nnz_view_t_ valuesC) { +void bspgemm_debug_numeric(KernelHandle* /* handle */, + typename KernelHandle::nnz_lno_t m, + typename KernelHandle::nnz_lno_t /* n */, + typename KernelHandle::nnz_lno_t k, + typename KernelHandle::nnz_lno_t block_dim, + alno_row_view_t_ row_mapA, alno_nnz_view_t_ entriesA, + ascalar_nnz_view_t_ valuesA, + + bool /* transposeA */, blno_row_view_t_ row_mapB, + blno_nnz_view_t_ entriesB, + bscalar_nnz_view_t_ valuesB, bool /* transposeB */, + clno_row_view_t_ row_mapC, clno_nnz_view_t_ entriesC, + cscalar_nnz_view_t_ valuesC) { typename alno_row_view_t_::HostMirror h_rma = Kokkos::create_mirror_view(row_mapA); Kokkos::deep_copy(h_rma, row_mapA); @@ -179,8 +197,17 @@ void spgemm_debug_numeric(KernelHandle * /* handle */, typedef typename KernelHandle::nnz_lno_t lno_t; typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_scalar_t scalar_t; + typedef KokkosBatched::SerialGemmInternal< + KokkosBatched::Algo::Gemm::Unblocked> + GEMM; + + const auto block_size = block_dim * block_dim; + const auto ZERO = static_cast(0); + const auto ONE = static_cast(1); - std::vector accumulator(k, 0); + typename cscalar_nnz_view_t_::HostMirror accumulator("acc", k * block_size); + Kokkos::deep_copy(accumulator, ZERO); + Kokkos::fence(); std::vector acc_flag(k, false); h_rmc(0) = 0; @@ -194,33 +221,38 @@ void spgemm_debug_numeric(KernelHandle * /* handle */, lno_t c_row_size_counter = 0; for (lno_t j = 0; j < a_row_size; ++j) { - size_type ind = a_row_begin + j; - lno_t col = h_enta(ind); - scalar_t val = h_vala(ind); + size_type ind = a_row_begin + j; + lno_t col = h_enta(ind); + auto a_val = h_vala.data() + ind * block_size; // valuesA(i, col) const size_type b_row_begin = h_rmb(col); const size_type b_row_end = h_rmb(col + 1); lno_t b_row_size = b_row_end - b_row_begin; for (lno_t z = 0; z < b_row_size; ++z) { size_type ind_ = b_row_begin + z; lno_t b_col = h_entb(ind_); - scalar_t b_val = h_valb(ind_); + auto b_val = h_valb.data() + ind_ * block_size; // valuesB(col, b_col) if (acc_flag[b_col] == false) { acc_flag[b_col] = true; h_entc(c_row_begin + c_row_size_counter++) = b_col; } - accumulator[b_col] += b_val * val; + // accumulator(b_col) += a_val * b_val + auto acc = get_block(accumulator, b_col, block_size); + GEMM::invoke(block_dim, block_dim, block_dim, ONE, a_val, block_dim, 1, + b_val, block_dim, 1, ONE, acc.data(), block_dim, 1); } } // if (i == 0) std::cout << "result_cols" << std::endl; for (lno_t j = 0; j < c_row_size; ++j) { - size_type ind = c_row_begin + j; - lno_t result_col = h_entc(ind); - h_valc(ind) = accumulator[result_col]; - accumulator[result_col] = 0; - acc_flag[result_col] = false; + size_type ind = c_row_begin + j; + lno_t result_col = h_entc(ind); + auto acc = get_block(accumulator, result_col, block_size); + Kokkos::deep_copy(get_block(h_valc, ind, block_size), acc); + Kokkos::deep_copy(acc, ZERO); + Kokkos::fence(); + acc_flag[result_col] = false; } } diff --git a/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp index 701106c623..658b2a1303 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp @@ -55,7 +55,7 @@ #include "KokkosSparse_spgemm_cuSPARSE_impl.hpp" #include "KokkosSparse_spgemm_CUSP_impl.hpp" #include "KokkosSparse_spgemm_impl.hpp" -#include "KokkosSparse_spgemm_impl_seq.hpp" +#include "KokkosSparse_bspgemm_impl_seq.hpp" #include "KokkosSparse_spgemm_mkl_impl.hpp" #include "KokkosSparse_spgemm_mkl2phase_impl.hpp" #include "KokkosSparse_spgemm_viennaCL_impl.hpp" @@ -287,10 +287,9 @@ struct BSPGEMM_NUMERIC< } break; case SPGEMM_SERIAL: case SPGEMM_DEBUG: - spgemm_debug_numeric(handle, m, n, k, row_mapA, entriesA, valuesA, - - transposeA, row_mapB, entriesB, valuesB, - transposeB, row_mapC, entriesC, valuesC); + bspgemm_debug_numeric(handle, m, n, k, blockDim, row_mapA, entriesA, + valuesA, transposeA, row_mapB, entriesB, valuesB, + transposeB, row_mapC, entriesC, valuesC); break; } } From 03ab2786c59f6e8d47cc231989d9747690cb2221 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Wed, 19 Jan 2022 14:40:33 +0100 Subject: [PATCH 095/261] Implementation of default block SpGEMM algorithm --- .../KokkosKernels_BlockHashmapAccumulator.hpp | 140 ++++-- src/common/KokkosKernels_BlockUtils.hpp | 144 ++++++ src/sparse/impl/KokkosSparse_bspgemm_impl.hpp | 391 ++++------------ .../impl/KokkosSparse_bspgemm_impl_def.hpp | 27 +- .../impl/KokkosSparse_bspgemm_impl_kkmem.hpp | 417 +++++++++--------- .../impl/KokkosSparse_bspgemm_impl_speed.hpp | 183 ++++---- .../KokkosSparse_bspgemm_numeric_spec.hpp | 14 +- src/sparse/impl/KokkosSparse_spgemm_impl.hpp | 4 +- 8 files changed, 670 insertions(+), 650 deletions(-) create mode 100644 src/common/KokkosKernels_BlockUtils.hpp diff --git a/src/common/KokkosKernels_BlockHashmapAccumulator.hpp b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp index b7f39f75c2..1777189612 100644 --- a/src/common/KokkosKernels_BlockHashmapAccumulator.hpp +++ b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp @@ -41,10 +41,11 @@ // ************************************************************************ //@HEADER */ -#ifndef _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP -#define _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP +#ifndef _KOKKOSKERNELS_BLOCKHASHMAPACCUMULATOR_HPP +#define _KOKKOSKERNELS_BLOCKHASHMAPACCUMULATOR_HPP #include #include +#include "KokkosKernels_BlockUtils.hpp" //#define HASHMAPACCUMULATOR_ASSERT_ENABLED @@ -52,6 +53,7 @@ namespace KokkosKernels { namespace Experimental { +#if 0 // defined in HashmapAccumulator header - include if needed or drop /** * @brief types of hash operations supported by HashmapAccumulator. * @@ -64,11 +66,12 @@ struct HashOpType { struct modulo {}; struct pow2Modulo {}; }; +#endif template /** - * \brief HashmapAccumulator class + * \brief BlockHashmapAccumulator class * The use of this is described in the paper: * "Performance-portable sparse matrix-matrix multiplication for many-core * architectures" ( https://ieeexplore.ieee.org/abstract/document/7965111/ ) in @@ -88,14 +91,14 @@ template = __max_value_size) { return __insert_full; } else { - keys[my_write_index] = key; - values[my_write_index] = value; + keys[my_write_index] = key; + KokkosSparse::Impl::kk_block_set_mul( + block_dim, values + my_write_index * block_size, valA, valB); #if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ defined(KOKKOS_ARCH_AMPERE) @@ -487,8 +549,9 @@ struct HashmapAccumulator { KOKKOS_INLINE_FUNCTION int vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( const team_member_t & /* teamMember */, const int /* vector_size */, - size_type hash, const key_type key, const value_type value, - volatile size_type *used_size_, const size_type max_value_size_) { + size_type hash, const key_type key, const value_type *valA, + const value_type *valB, volatile size_type *used_size_, + const size_type max_value_size_) { // Cannot compute hash here due to impl_speed use-case // hash = __compute_hash(key, __hashOpRHS); if (key == -1) return __insert_success; @@ -497,7 +560,8 @@ struct HashmapAccumulator { size_type i = hash_begins[hash]; for (; i != -1; i = hash_nexts[i]) { if (keys[i] == key) { - values[i] = values[i] + value; + KokkosSparse::Impl::kk_block_add_mul( + block_dim, values + i * block_size, valA, valB); return __insert_success; } } @@ -516,8 +580,9 @@ struct HashmapAccumulator { if (my_write_index >= max_value_size_) { return __insert_full; } else { - keys[my_write_index] = key; - values[my_write_index] = value; + keys[my_write_index] = key; + KokkosSparse::Impl::kk_block_set_mul( + block_dim, values + my_write_index * block_size, valA, valB); #if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ defined(KOKKOS_ARCH_AMPERE) @@ -566,15 +631,17 @@ struct HashmapAccumulator { // used_size should be a shared pointer among the thread vectors KOKKOS_INLINE_FUNCTION int vector_atomic_insert_into_hash_mergeAdd(const key_type key, - const value_type value, + const value_type *valA, + const value_type *valB, volatile size_type *used_size_) { if (key == -1) return __insert_success; return vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( - nullptr, 0, __compute_hash(key, __hashOpRHS), key, value, used_size_, - __max_value_size); + nullptr, 0, __compute_hash(key, __hashOpRHS), key, valA, valB, + used_size_, __max_value_size); } +#if 0 // used in symbolic of kkmem if the compression is not applied. KOKKOS_INLINE_FUNCTION int vector_atomic_insert_into_hash(const key_type &key, @@ -780,6 +847,7 @@ struct HashmapAccumulator { return __insert_success; } } +#endif // end public members private: size_type __max_value_size; @@ -813,7 +881,7 @@ struct HashmapAccumulator { return hash; } // private -}; // struct HashmapAccumulator +}; // struct BlockHashmapAccumulator } // namespace Experimental } // namespace KokkosKernels diff --git a/src/common/KokkosKernels_BlockUtils.hpp b/src/common/KokkosKernels_BlockUtils.hpp new file mode 100644 index 0000000000..c6f9f55e3e --- /dev/null +++ b/src/common/KokkosKernels_BlockUtils.hpp @@ -0,0 +1,144 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef _KOKKOSKERNELS_BLOCKUTILS_HPP +#define _KOKKOSKERNELS_BLOCKUTILS_HPP + +// #include +// #include +#include "KokkosBatched_Gemm_Serial_Internal.hpp" + +namespace KokkosSparse { +namespace Impl { + +// Initializes block: A = [val, val, val, ....] +template +KOKKOS_INLINE_FUNCTION void kk_block_init( + const size_type block_dim, value_type *dst, + const value_type val = static_cast( + 0)) { // Note: replaces __host__ std::fill() not to be called from GPU + for (auto end = dst + (block_dim * block_dim); dst < end; ++dst) { + *dst = val; + } +} + +// Initializes block: A = B +template +KOKKOS_INLINE_FUNCTION void kk_block_set(const size_type block_dim, + value_type *dst, + const value_type *val) { + memcpy(dst, val, block_dim * block_dim * sizeof(value_type)); +} + +// Performs A += B on blocks +template +KOKKOS_INLINE_FUNCTION void kk_block_add(const size_type block_dim, + value_type *dst, + const value_type *val) { + const auto end = dst + block_dim * block_dim; + while (dst < end) { + *(dst++) += *(val++); + } +} + +// Performs C += A * B on blocks +// Note: block is assumed to be row-major, dense matrix (no extra padding) +// Note: set clear=true to set C = 0 before increment +template > +KOKKOS_INLINE_FUNCTION void kk_block_dgemm(const size_type block_dim, + value_type *dst, + const value_type *valA, + const value_type *valB, + const bool clear = false) { + const auto ZERO = static_cast(0); + const auto ONE = static_cast(1); + DGEMM::invoke(block_dim, block_dim, block_dim, ONE, valA, block_dim, 1, valB, + block_dim, 1, clear ? ZERO : ONE, dst, block_dim, 1); +} + +// dgemm: C = A * B +template +KOKKOS_INLINE_FUNCTION void kk_block_set_mul(const size_type block_dim, + value_type *c_val, + const value_type *a_val, + const value_type *b_val) { + kk_block_dgemm(block_dim, c_val, a_val, b_val, true); +} + +// dgemm: C += A * B +template +KOKKOS_INLINE_FUNCTION void kk_block_add_mul(const size_type block_dim, + value_type *c_val, + const value_type *a_val, + const value_type *b_val) { + kk_block_dgemm(block_dim, c_val, a_val, b_val, false); +} + +// Performs C += A * B (dense GEMM) on blocks +// Note: all pointers reference dense row-major blocks (no extra padding) +template +KOKKOS_INLINE_FUNCTION void kk_vector_block_mul_add(const size_type block_dim, + value_type *dst, + const value_type *valA, + const value_type *valB) { + // NOTE: this should be replaced by batched DGEMM + // once atomic increment is supported there + for (size_type row = 0; row < block_dim; ++row) { + auto const row_offset = row * block_dim; + for (size_type col = 0; col < block_dim; ++col) { + auto v = &dst[row_offset + col]; + auto vb = valB + col; + for (auto va = valA + row_offset, end = va + block_dim; va < end; ++va) { + Kokkos::atomic_add(v, (*va) * (*vb)); + vb += block_dim; + } + } + } +} + +} // namespace Impl +} // namespace KokkosSparse + +#endif // _KOKKOSKERNELS_BLOCKUTILS_HPP diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp index 09a8bf212a..d015778ca1 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp @@ -42,8 +42,8 @@ //@HEADER */ -#ifndef _KOKKOSSPGEMMIMPL_HPP -#define _KOKKOSSPGEMMIMPL_HPP +#ifndef _KOKKOSBSPGEMMIMPL_HPP +#define _KOKKOSBSPGEMMIMPL_HPP //#define KOKKOSKERNELS_ANALYZE_COMPRESSION //#define KOKKOSKERNELS_ANALYZE_MEMORYACCESS @@ -53,19 +53,8 @@ //#define GPU_EXPERIMENTAL //#define NUMERIC_USE_STATICMEM //#define twostep -#include -#include -#include -#include -#include -#include -#include -#include - -#include "KokkosKernels_HashmapAccumulator.hpp" -#include "KokkosKernels_Uniform_Initialized_MemoryPool.hpp" -#include "KokkosSparse_spgemm_handle.hpp" -#include "KokkosGraph_Distance1Color.hpp" + +#include "KokkosSparse_spgemm_impl.hpp" namespace KokkosSparse { @@ -75,213 +64,46 @@ template -class KokkosSPGEMM { +class KokkosBSPGEMM + : public KokkosSPGEMM { public: - typedef a_row_view_t_ a_row_view_t; - typedef a_lno_nnz_view_t_ a_in_lno_nnz_view_t; - typedef a_scalar_nnz_view_t_ a_in_scalar_nnz_view_t; - - typedef b_lno_row_view_t_ b_in_lno_row_view_t; - typedef b_lno_nnz_view_t_ b_in_lno_nnz_view_t; - typedef b_scalar_nnz_view_t_ b_in_scalar_nnz_view_t; - - typedef typename a_row_view_t::non_const_value_type size_type; - typedef typename a_row_view_t::const_value_type const_size_type; - - typedef typename a_in_lno_nnz_view_t::non_const_value_type nnz_lno_t; - typedef typename a_in_lno_nnz_view_t::const_value_type const_nnz_lno_t; - - typedef typename a_in_scalar_nnz_view_t::non_const_value_type scalar_t; - typedef typename a_in_scalar_nnz_view_t::const_value_type const_scalar_t; - - typedef typename a_row_view_t::const_type const_a_lno_row_view_t; - typedef typename a_row_view_t::non_const_type non_const_a_lno_row_view_t; - - typedef typename a_in_lno_nnz_view_t::const_type const_a_lno_nnz_view_t; - typedef - typename a_in_lno_nnz_view_t::non_const_type non_const_a_lno_nnz_view_t; - - typedef typename a_in_scalar_nnz_view_t::const_type const_a_scalar_nnz_view_t; - typedef typename a_in_scalar_nnz_view_t::non_const_type - non_const_a_scalar_nnz_view_t; - - typedef typename b_in_lno_row_view_t::const_type const_b_lno_row_view_t; - typedef - typename b_in_lno_row_view_t::non_const_type non_const_b_lno_row_view_t; - - typedef typename b_in_lno_nnz_view_t::const_type const_b_lno_nnz_view_t; - typedef - typename b_in_lno_nnz_view_t::non_const_type non_const_b_lno_nnz_view_t; - - typedef typename b_in_scalar_nnz_view_t::const_type const_b_scalar_nnz_view_t; - typedef typename b_in_scalar_nnz_view_t::non_const_type - non_const_b_scalar_nnz_view_t; - - typedef typename HandleType::HandleExecSpace MyExecSpace; - typedef typename HandleType::HandleTempMemorySpace MyTempMemorySpace; - typedef - typename HandleType::HandlePersistentMemorySpace MyPersistentMemorySpace; - - typedef - typename HandleType::row_lno_temp_work_view_t row_lno_temp_work_view_t; - typedef typename HandleType::row_lno_persistent_work_view_t - row_lno_persistent_work_view_t; - typedef typename HandleType::row_lno_persistent_work_host_view_t - row_lno_persistent_work_host_view_t; // Host view type - - typedef - typename HandleType::nnz_lno_temp_work_view_t nnz_lno_temp_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_view_t - nnz_lno_persistent_work_view_t; - typedef typename HandleType::nnz_lno_persistent_work_host_view_t - nnz_lno_persistent_work_host_view_t; // Host view type - - typedef typename HandleType::scalar_temp_work_view_t scalar_temp_work_view_t; - typedef typename HandleType::scalar_persistent_work_view_t - scalar_persistent_work_view_t; - - typedef typename HandleType::bool_persistent_view_t bool_persistent_view_t; - typedef typename HandleType::bool_temp_view_t bool_temp_view_t; - - typedef Kokkos::RangePolicy my_exec_space; - typedef Kokkos::TeamPolicy team_policy_t; - typedef typename team_policy_t::member_type team_member_t; - - struct CountTag {}; - struct GPUCountTag {}; - struct CountTag2 {}; - - struct FillTag {}; - struct FillTag2 {}; - struct MultiCoreDenseAccumulatorTag {}; - struct MultiCoreDenseAccumulatorTag2 {}; - struct MultiCoreDenseAccumulatorTag3 {}; - struct NoCompressMultiCoreDenseAccumulatorTag {}; - struct NoCompressMultiCoreDenseAccumulatorTag2 {}; - struct NoCompressMultiCoreDenseAccumulatorTag3 {}; - struct MultiCoreTag {}; - struct MultiCoreTag2 {}; - struct MultiCoreTag3 {}; - struct MultiCoreTag4 {}; - struct MultiCoreTag5 {}; - struct MultiCoreTag6 {}; - struct GPUTag {}; - struct GPUTag2 {}; - struct GPUTag3 {}; - struct GPUTag4 {}; - struct GPUTag5 {}; - struct GPUTag6 {}; - - struct Numeric1Tag {}; - struct Numeric2Tag {}; - struct Numeric3Tag {}; - - typedef Kokkos::TeamPolicy - multicore_dense_team_count_policy_t; - typedef Kokkos::TeamPolicy - multicore_dense_team2_count_policy_t; - typedef Kokkos::TeamPolicy - multicore_dense_team3_count_policy_t; - - typedef Kokkos::TeamPolicy - nc_multicore_dense_team_count_policy_t; - typedef Kokkos::TeamPolicy - nc_multicore_dense_team2_count_policy_t; - typedef Kokkos::TeamPolicy - nc_multicore_dense_team3_count_policy_t; - - typedef Kokkos::TeamPolicy > - nc_dynamic_multicore_dense_team_count_policy_t; - typedef Kokkos::TeamPolicy > - nc_dynamic_multicore_dense_team2_count_policy_t; - typedef Kokkos::TeamPolicy > - nc_dynamic_multicore_dense_team3_count_policy_t; - - typedef Kokkos::TeamPolicy multicore_team_policy_t; - typedef Kokkos::TeamPolicy - multicore_team_policy2_t; - typedef Kokkos::TeamPolicy - multicore_team_policy3_t; - typedef Kokkos::TeamPolicy - multicore_team_policy4_t; - typedef Kokkos::TeamPolicy - multicore_team_policy5_t; - typedef Kokkos::TeamPolicy - multicore_team_policy6_t; - - typedef Kokkos::TeamPolicy gpu_team_policy_t; - typedef Kokkos::TeamPolicy gpu_team_policy2_t; - typedef Kokkos::TeamPolicy gpu_team_policy3_t; - typedef Kokkos::TeamPolicy gpu_team_policy4_t; - typedef Kokkos::TeamPolicy gpu_team_policy5_t; - typedef Kokkos::TeamPolicy gpu_team_policy6_t; - - typedef Kokkos::TeamPolicy team_count_policy_t; - typedef Kokkos::TeamPolicy team_count2_policy_t; - - typedef Kokkos::TeamPolicy team_gpucount_policy_t; - - typedef Kokkos::TeamPolicy team_fill_policy_t; - typedef Kokkos::TeamPolicy team_fill2_policy_t; - - typedef Kokkos::TeamPolicy team_numeric1_policy_t; - typedef Kokkos::TeamPolicy team_numeric2_policy_t; - typedef Kokkos::TeamPolicy team_numeric3_policy_t; - - typedef Kokkos::TeamPolicy > - dynamic_multicore_dense_team_count_policy_t; - typedef Kokkos::TeamPolicy > - dynamic_multicore_dense_team2_count_policy_t; - typedef Kokkos::TeamPolicy > - dynamic_multicore_dense_team3_count_policy_t; - - typedef Kokkos::TeamPolicy > - dynamic_multicore_team_policy_t; - typedef Kokkos::TeamPolicy > - dynamic_multicore_team_policy2_t; - typedef Kokkos::TeamPolicy > - dynamic_multicore_team_policy3_t; - typedef Kokkos::TeamPolicy > - dynamic_multicore_team_policy4_t; - typedef Kokkos::TeamPolicy > - dynamic_multicore_team_policy5_t; - typedef Kokkos::TeamPolicy > - dynamic_multicore_team_policy6_t; - - typedef Kokkos::TeamPolicy > - dynamic_team_count_policy_t; - typedef Kokkos::TeamPolicy > - dynamic_team_fill_policy_t; - typedef Kokkos::TeamPolicy > - dynamic_team_numeric1_policy_t; - typedef Kokkos::TeamPolicy > - dynamic_team_numeric2_policy_t; - typedef Kokkos::TeamPolicy > - dynamic_team_numeric3_policy_t; - - typedef Kokkos::TeamPolicy > - dynamic_team_policy_t; - + using Base = KokkosSparse::Impl::KokkosSPGEMM< + HandleType, a_row_view_t_, a_lno_nnz_view_t_, a_scalar_nnz_view_t_, + b_lno_row_view_t_, b_lno_nnz_view_t_, b_scalar_nnz_view_t_>; + +#define USE_BASE_TYPE(type) using type = typename Base::type; + + USE_BASE_TYPE(nnz_lno_t) + USE_BASE_TYPE(scalar_t) + USE_BASE_TYPE(size_type) + USE_BASE_TYPE(const_a_lno_row_view_t) + USE_BASE_TYPE(const_a_lno_nnz_view_t) + USE_BASE_TYPE(const_a_scalar_nnz_view_t) + USE_BASE_TYPE(const_b_lno_row_view_t) + USE_BASE_TYPE(const_b_lno_nnz_view_t) + USE_BASE_TYPE(const_b_scalar_nnz_view_t) + USE_BASE_TYPE(row_lno_persistent_work_view_t) + USE_BASE_TYPE(nnz_lno_temp_work_view_t) + USE_BASE_TYPE(team_member_t) + + USE_BASE_TYPE(MyExecSpace) + USE_BASE_TYPE(MyTempMemorySpace) + USE_BASE_TYPE(MultiCoreTag) + USE_BASE_TYPE(MultiCoreTag4) + USE_BASE_TYPE(GPUTag) + USE_BASE_TYPE(GPUTag4) + USE_BASE_TYPE(GPUTag6) + USE_BASE_TYPE(gpu_team_policy_t) + USE_BASE_TYPE(gpu_team_policy4_t) + USE_BASE_TYPE(gpu_team_policy6_t) + USE_BASE_TYPE(dynamic_multicore_team_policy_t) + USE_BASE_TYPE(dynamic_multicore_team_policy4_t) + USE_BASE_TYPE(multicore_team_policy_t) + USE_BASE_TYPE(multicore_team_policy4_t) + +#if 0 // defined in base class (clean up or implement block version) private: HandleType *handle; nnz_lno_t a_row_cnt; @@ -391,6 +213,7 @@ class KokkosSPGEMM { template void KokkosSPGEMM_numeric_triangle_ai(c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_); +#endif public: ////////////////////////////////////////////////////////////////////////// @@ -417,11 +240,12 @@ class KokkosSPGEMM { */ template - void KokkosSPGEMM_numeric_speed( + void KokkosBSPGEMM_numeric_speed( c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, c_scalar_nnz_view_t valuesC_, KokkosKernels::Impl::ExecSpaceType my_exec_space); +#if 0 public: /* ////////////////////////////////////////////////////////////////////////// @@ -458,6 +282,22 @@ class KokkosSPGEMM { nnz_lno_t &num_multi_color_steps, SPGEMMAlgorithm spgemm_algorithm); */ +#endif + private: + // How many extra bytes are needed to align a scalar_t after an array of + // nnz_lno_t, in the worst case? Incurred once per hashmap, which may be per + // team or per thread depending on algorithm + static constexpr size_t scalarAlignPad = + (alignof(scalar_t) > alignof(nnz_lno_t)) + ? (alignof(scalar_t) - alignof(nnz_lno_t)) + : 0; + + static constexpr bool exec_gpu = + KokkosKernels::Impl::kk_is_gpu_exec_space(); + + private: + nnz_lno_t block_dim; + public: ////////////////////////////////////////////////////////////////////////// /////BELOW CODE IS TO for kkmem SPGEMM @@ -470,16 +310,14 @@ class KokkosSPGEMM { typename c_scalar_view_t, typename pool_memory_type> struct PortableNumericCHASH; - private: - // KKMEM only difference is work memory does not use output memory for 2nd - // level accumulator. template - void KokkosSPGEMM_numeric_hash2( + void KokkosBSPGEMM_numeric_hash( c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, c_scalar_nnz_view_t valuesC_, KokkosKernels::Impl::ExecSpaceType my_exec_space); +#if 0 // defined in base class (clean up or implement block version) template void KokkosSPGEMM_numeric_hash( @@ -586,6 +424,7 @@ class KokkosSPGEMM { // 4-KKMULTICOLOR2 ); +#endif #endif public: @@ -595,11 +434,13 @@ class KokkosSPGEMM { ////////////////////////////////////////////////////////////////////////// template - void KokkosSPGEMM_numeric(c_row_view_t &rowmapC_, c_lno_nnz_view_t &entriesC_, - c_scalar_nnz_view_t &valuesC_); + void KokkosBSPGEMM_numeric(c_row_view_t &rowmapC_, + c_lno_nnz_view_t &entriesC_, + c_scalar_nnz_view_t &valuesC_); // TODO: These are references only for outer product algorithm. // If the algorithm is removed, then remove the references. +#if 0 /** * \brief Symbolic phase of the SPGEMM. * \param rowmapC_: row pointers for the result matrix. Allocated before the @@ -614,67 +455,29 @@ class KokkosSPGEMM { nnz_lno_persistent_work_view_t &color_adj, c_row_view_t &rowmapC, c_nnz_view_t &entryIndicesC_); +#endif - KokkosSPGEMM(HandleType *handle_, nnz_lno_t m_, nnz_lno_t n_, nnz_lno_t k_, - const_a_lno_row_view_t row_mapA_, - const_a_lno_nnz_view_t entriesA_, bool transposeA_, - const_b_lno_row_view_t row_mapB_, - const_b_lno_nnz_view_t entriesB_, bool transposeB_) - : handle(handle_), - a_row_cnt(m_), - b_row_cnt(n_), - b_col_cnt(k_), - row_mapA(row_mapA_), - entriesA(entriesA_), - valsA(), - transposeA(transposeA_), - row_mapB(row_mapB_), - entriesB(entriesB_), - valsB(), - transposeB(transposeB_), - shmem_size(handle_->get_shmem_size()), - concurrency(MyExecSpace::concurrency()), - use_dynamic_schedule(handle_->is_dynamic_scheduling()), - KOKKOSKERNELS_VERBOSE(handle_->get_verbose()), - MyEnumExecSpace(this->handle->get_handle_exec_space()), - spgemm_algorithm( - this->handle->get_spgemm_handle()->get_algorithm_type()), - spgemm_accumulator( - this->handle->get_spgemm_handle()->get_accumulator_type()) - //,row_mapC(), entriesC(), valsC() - {} - - KokkosSPGEMM(HandleType *handle_, nnz_lno_t m_, nnz_lno_t n_, nnz_lno_t k_, - const_a_lno_row_view_t row_mapA_, - const_a_lno_nnz_view_t entriesA_, - const_a_scalar_nnz_view_t valsA_, bool transposeA_, - const_b_lno_row_view_t row_mapB_, - const_b_lno_nnz_view_t entriesB_, - const_b_scalar_nnz_view_t valsB_, bool transposeB_) - : handle(handle_), - a_row_cnt(m_), - b_row_cnt(n_), - b_col_cnt(k_), - row_mapA(row_mapA_), - entriesA(entriesA_), - valsA(valsA_), - transposeA(transposeA_), - row_mapB(row_mapB_), - entriesB(entriesB_), - valsB(valsB_), - transposeB(transposeB_), - shmem_size(handle_->get_shmem_size()), - concurrency(MyExecSpace::concurrency()), - use_dynamic_schedule(handle_->is_dynamic_scheduling()), - KOKKOSKERNELS_VERBOSE(handle_->get_verbose()), - MyEnumExecSpace(this->handle->get_handle_exec_space()), - spgemm_algorithm( - this->handle->get_spgemm_handle()->get_algorithm_type()), - spgemm_accumulator( - this->handle->get_spgemm_handle()->get_accumulator_type()) - //,row_mapB(), entriesC(), valsC() - {} - + KokkosBSPGEMM(HandleType *handle_, nnz_lno_t m_, nnz_lno_t n_, nnz_lno_t k_, + nnz_lno_t block_dim_, const_a_lno_row_view_t row_mapA_, + const_a_lno_nnz_view_t entriesA_, bool transposeA_, + const_b_lno_row_view_t row_mapB_, + const_b_lno_nnz_view_t entriesB_, bool transposeB_) + : Base(handle_, m_, n_, k_, row_mapA_, entriesA_, transposeA_, row_mapB_, + entriesB_, transposeB_), + block_dim(block_dim_) {} + + KokkosBSPGEMM(HandleType *handle_, nnz_lno_t m_, nnz_lno_t n_, nnz_lno_t k_, + nnz_lno_t block_dim_, const_a_lno_row_view_t row_mapA_, + const_a_lno_nnz_view_t entriesA_, + const_a_scalar_nnz_view_t valsA_, bool transposeA_, + const_b_lno_row_view_t row_mapB_, + const_b_lno_nnz_view_t entriesB_, + const_b_scalar_nnz_view_t valsB_, bool transposeB_) + : Base(handle_, m_, n_, k_, row_mapA_, entriesA_, valsA_, transposeA_, + row_mapB_, entriesB_, valsB_, transposeB_), + block_dim(block_dim_) {} + +#if 0 // defined in base class (clean up or implement block version) ////////////////////////////////////////////////////////////////////////// /////BELOW CODE IS for symbolic phase ////DECL IS AT _symbolic.hpp @@ -837,16 +640,12 @@ class KokkosSPGEMM { } return po2_num_chunks; } +#endif }; } // namespace Impl } // namespace KokkosSparse -#include "KokkosSparse_spgemm_imp_outer.hpp" -#include "KokkosSparse_spgemm_impl_memaccess.hpp" -#include "KokkosSparse_spgemm_impl_kkmem.hpp" -#include "KokkosSparse_spgemm_impl_speed.hpp" -#include "KokkosSparse_spgemm_impl_compression.hpp" -#include "KokkosSparse_spgemm_impl_def.hpp" -#include "KokkosSparse_spgemm_impl_symbolic.hpp" -#include "KokkosSparse_spgemm_impl_triangle.hpp" +#include "KokkosSparse_bspgemm_impl_kkmem.hpp" +#include "KokkosSparse_bspgemm_impl_speed.hpp" +#include "KokkosSparse_bspgemm_impl_def.hpp" #endif diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp index 173a58b568..c4ecbd6503 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp @@ -52,32 +52,32 @@ template template -void KokkosSPGEMM< - HandleType, a_row_view_t_, a_lno_nnz_view_t_, a_scalar_nnz_view_t_, - b_lno_row_view_t_, b_lno_nnz_view_t_, - b_scalar_nnz_view_t_>::KokkosSPGEMM_numeric(c_row_view_t &rowmapC_, - c_lno_nnz_view_t &entriesC_, - c_scalar_nnz_view_t &valuesC_) { +void KokkosBSPGEMM:: + KokkosBSPGEMM_numeric(c_row_view_t &rowmapC_, c_lno_nnz_view_t &entriesC_, + c_scalar_nnz_view_t &valuesC_) { // get the algorithm and execution space. // SPGEMMAlgorithm spgemm_algorithm = // this->handle->get_spgemm_handle()->get_algorithm_type(); KokkosKernels::Impl::ExecSpaceType my_exec_space_ = KokkosKernels::Impl::get_exec_space_type(); - if (KOKKOSKERNELS_VERBOSE) { + if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "Numeric PHASE" << std::endl; } - if (spgemm_algorithm == SPGEMM_KK_SPEED || - spgemm_algorithm == SPGEMM_KK_DENSE) { - this->KokkosSPGEMM_numeric_speed(rowmapC_, entriesC_, valuesC_, - my_exec_space_); + if (Base::spgemm_algorithm == SPGEMM_KK_SPEED || + Base::spgemm_algorithm == SPGEMM_KK_DENSE) { + this->KokkosBSPGEMM_numeric_speed(rowmapC_, entriesC_, valuesC_, + my_exec_space_); } else { - this->KokkosSPGEMM_numeric_hash(rowmapC_, entriesC_, valuesC_, - my_exec_space_); + this->KokkosBSPGEMM_numeric_hash(rowmapC_, entriesC_, valuesC_, + my_exec_space_); } } +#if 0 // symbolic not needed in BSPGEMM template -struct KokkosSPGEMM::PortableNumericCHASH { + using BlockAccumulator = KokkosKernels::Experimental::BlockHashmapAccumulator< + nnz_lno_t, nnz_lno_t, scalar_t, + KokkosKernels::Experimental::HashOpType::bitwiseAnd>; + + static constexpr auto scalarAlignPad = + KokkosBSPGEMM::PortableNumericCHASH { + b_scalar_nnz_view_t_>::scalarAlignPad; nnz_lno_t numrows; + nnz_lno_t block_dim; + const nnz_lno_t block_size; + size_t block_bytes; a_row_view_t row_mapA; a_nnz_view_t entriesA; @@ -106,8 +118,8 @@ struct KokkosSPGEMM alignof(nnz_lno_t)) - ? (alignof(scalar_t) - alignof(nnz_lno_t)) - : 0; team_shmem_key_size = ((shared_memory_size - sizeof(nnz_lno_t) * 4 - scalarAlignPad) / unit_memory); @@ -202,13 +210,13 @@ struct KokkosSPGEMM> 1) << 1; thread_shmem_key_size = thread_shmem_key_size + ((thread_shmem_key_size - thread_shmem_hash_size) * sizeof(nnz_lno_t)) / - (sizeof(nnz_lno_t) * 2 + sizeof(scalar_t)); + (sizeof(nnz_lno_t) * 2 + block_bytes); thread_shmem_key_size = (thread_shmem_key_size >> 1) << 1; if (KOKKOSKERNELS_VERBOSE_) { @@ -292,6 +300,9 @@ struct KokkosSPGEMM(tmp); + BlockAccumulator hm(block_dim, pow2_hash_size, pow2_hash_func, nullptr, + nullptr, hash_ids, hash_values); + Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), [&](const nnz_lno_t &row_index) { @@ -300,9 +311,9 @@ struct KokkosSPGEMM - hm2(pow2_hash_size, pow2_hash_func, NULL, NULL, NULL, NULL); + BlockAccumulator hm2(block_dim, pow2_hash_size, pow2_hash_func, nullptr, + nullptr, nullptr, nullptr); volatile nnz_lno_t *tmp = NULL; size_t tid = get_thread_id(team_row_begin + teamMember.team_rank()); @@ -372,15 +366,15 @@ struct KokkosSPGEMM(all_shared_memory); - KokkosKernels::Experimental::HashmapAccumulator< - nnz_lno_t, nnz_lno_t, scalar_t, - KokkosKernels::Experimental::HashOpType::bitwiseAnd> - hm(thread_shmem_key_size, thread_shared_memory_hash_func, begins, nexts, - keys, vals); + BlockAccumulator hm(block_dim, thread_shmem_key_size, + thread_shared_memory_hash_func, begins, nexts, keys, + vals); - KokkosKernels::Experimental::HashmapAccumulator< - nnz_lno_t, nnz_lno_t, scalar_t, - KokkosKernels::Experimental::HashOpType::bitwiseAnd> - hm2(pow2_hash_size, pow2_hash_func, NULL, NULL, NULL, NULL); + BlockAccumulator hm2(block_dim, pow2_hash_size, pow2_hash_func, nullptr, + nullptr, nullptr, nullptr); Kokkos::parallel_for( Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), [&](const nnz_lno_t &row_index) { @@ -575,7 +567,7 @@ struct KokkosSPGEMM 0) { - size_type a_col = col_begin + ii; - nnz_lno_t rowB = entriesA[a_col]; - scalar_t valA = valuesA[a_col]; + size_type a_col = col_begin + ii; + nnz_lno_t rowB = entriesA[a_col]; + const scalar_t *valA = valuesA.data() + a_col * block_size; size_type rowBegin = row_mapB(rowB); nnz_lno_t left_work_ = row_mapB(rowB + 1) - rowBegin; @@ -605,13 +597,13 @@ struct KokkosSPGEMM= current_a_column_flops); - a_col_val = - valuesA[a_col_begin_offset + current_a_column_offset_inrow]; + const auto idx = a_col_begin_offset + current_a_column_offset_inrow; + a_val = valuesA.data() + idx * block_size; } - my_b_col = entriesB[my_b_col_shift + current_b_read_offsett]; - my_b_val = - valuesB[my_b_col_shift + current_b_read_offsett] * a_col_val; + const auto idx = my_b_col_shift + current_b_read_offsett; + my_b_col = entriesB[idx]; + const scalar_t *b_val = valuesB.data() + idx * block_size; // now insert it to first level hashmap accumulator. hash = (my_b_col * HASHSCALAR) & team_cuckoo_hash_func; fail = 1; @@ -814,7 +810,8 @@ struct KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; @@ -839,7 +837,8 @@ struct KokkosSPGEMM max_first_level_hash_size) insert_is_on = false; @@ -864,15 +864,18 @@ struct KokkosSPGEMM 0; + trial = (trial + 1) & team_cuckoo_hash_func) { if (keys[trial] == my_b_col) { - vals[trial] += my_b_val; + kk_block_add(block_dim, vals + trial * block_size, b_val); fail = 0; break; } else if (keys[trial] == init_value) { @@ -950,7 +941,8 @@ struct KokkosSPGEMM= current_a_column_flops); - a_col_val = - valuesA[a_col_begin_offset + current_a_column_offset_inrow]; + const auto idx = a_col_begin_offset + current_a_column_offset_inrow; + a_val = valuesA.data() + idx * block_size; } my_b_col = entriesB[my_b_col_shift + current_b_read_offsett]; - my_b_val = - valuesB[my_b_col_shift + current_b_read_offsett] * a_col_val; + const auto idx = my_b_col_shift + current_b_read_offsett; + const scalar_t *b_val = valuesB.data() + idx * block_size; // now insert it to first level hashmap accumulator. hash = (my_b_col * HASHSCALAR) & team_cuckoo_hash_func; @@ -1134,13 +1127,15 @@ struct KokkosSPGEMMspgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == // this->spgemm_algorithm) : -// call KokkosSPGEMM_numeric_speed(...) +// call KokkosBSPGEMM_numeric_speed(...) // else: -// call KokkosSPGEMM_numeric_hash(...) (this code!) +// call KokkosBSPGEMM_numeric_hash(...) (this code!) // -// * NOTE: KokkosSPGEMM_numeric_hash2(...) is not called +// * NOTE: KokkosBSPGEMM_numeric_hash2(...) is not called // // -// KokkosSPGEMM_numeric_hash: +// KokkosBSPGEMM_numeric_hash: // // Algorithm selection may be modified as follows // // algorithm_to_run: initialized to spgemm_algorithm input to -// KokkosSPGEMM_numeric_hash +// KokkosBSPGEMM_numeric_hash // * spgemm_algorithm CANNOT be SPGEMM_KK_SPEED or SPGEMM_KK_DENSE // // if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == @@ -1225,7 +1223,7 @@ struct KokkosSPGEMMKokkosSPGEMM_numeric_speed" +// 2. if dense: call "this->KokkosBSPGEMM_numeric_speed" // else : no change from algorithm_to_run; that is algorithm_to_run == // SPGEMM_KK || SPGEMM_KK_LP // @@ -1262,25 +1260,25 @@ template template -void KokkosSPGEMM:: - KokkosSPGEMM_numeric_hash( +void KokkosBSPGEMM:: + KokkosBSPGEMM_numeric_hash( c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, c_scalar_nnz_view_t valuesC_, KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space) { - if (KOKKOSKERNELS_VERBOSE) { + if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "\tHASH MODE" << std::endl; } KokkosSparse::SPGEMMAlgorithm algorithm_to_run = this->spgemm_algorithm; - nnz_lno_t brows = row_mapB.extent(0) - 1; - size_type bnnz = valsB.extent(0); + nnz_lno_t brows = Base::row_mapB.extent(0) - 1; + size_type bnnz = Base::valsB.extent(0); int suggested_vector_size = this->handle->get_suggested_vector_size(brows, bnnz); int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); - size_t shmem_size_to_use = shmem_size; + size_t shmem_size_to_use = Base::shmem_size; row_lno_persistent_work_view_t flops_per_row = this->handle->get_spgemm_handle()->row_flops; @@ -1306,19 +1304,12 @@ void KokkosSPGEMM alignof(nnz_lno_t)) - ? (alignof(scalar_t) - alignof(nnz_lno_t)) - : 0; - // START OF SHARED MEMORY SIZE CALCULATIONS // NOTE: the values computed here are not actually passed to functors // requiring shmem, the calculations here are used for algorithm selection + const size_t block_bytes = sizeof(scalar_t) * block_dim * block_dim; nnz_lno_t unit_memory = - sizeof(nnz_lno_t) * 2 + sizeof(nnz_lno_t) + sizeof(scalar_t); + sizeof(nnz_lno_t) * 2 + sizeof(nnz_lno_t) + block_bytes; nnz_lno_t team_shmem_key_size = ((shmem_size_to_use - sizeof(nnz_lno_t) * 4 - scalarAlignPad) / unit_memory); @@ -1328,7 +1319,7 @@ void KokkosSPGEMM> 1) << 1; thread_shmem_key_size = thread_shmem_key_size + ((thread_shmem_key_size - thread_shmem_hash_size) * sizeof(nnz_lno_t)) / - (sizeof(nnz_lno_t) * 2 + sizeof(scalar_t)); + (sizeof(nnz_lno_t) * 2 + block_bytes); thread_shmem_key_size = (thread_shmem_key_size >> 1) << 1; // choose parameters @@ -1397,11 +1388,11 @@ void KokkosSPGEMM> 1) << 1; } - if (KOKKOSKERNELS_VERBOSE) { + if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "\t\t\tRunning KKMEM with suggested_vector_size:" << suggested_vector_size << " suggested_team_size:" << suggested_team_size @@ -1410,7 +1401,7 @@ void KokkosSPGEMMb_col_cnt; if (col_size < max_column_cut_off) { run_dense = true; - if (KOKKOSKERNELS_VERBOSE) { + if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "\t\t\tRunning SPGEMM_KK_DENSE col_size:" << col_size << " max_column_cut_off:" << max_column_cut_off << std::endl; @@ -1498,18 +1489,18 @@ void KokkosSPGEMM= dense_chunksize * 0.5) { run_dense = true; - if (KOKKOSKERNELS_VERBOSE) { + if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "\t\t\tRunning SPGEMM_KK_SPEED kkmem_chunksize:" << kkmem_chunksize << " dense_chunksize:" << dense_chunksize << std::endl; } } else { run_dense = false; - if (KOKKOSKERNELS_VERBOSE) { + if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "\t\t\tRunning SPGEMM_KK_MEMORY col_size:" << col_size << " max_column_cut_off:" << max_column_cut_off << std::endl; @@ -1518,15 +1509,15 @@ void KokkosSPGEMMKokkosSPGEMM_numeric_speed(rowmapC_, entriesC_, valuesC_, - lcl_my_exec_space); + this->KokkosBSPGEMM_numeric_speed(rowmapC_, entriesC_, valuesC_, + lcl_my_exec_space); return; } } } nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size( - suggested_team_size, concurrency, a_row_cnt); - if (KOKKOSKERNELS_VERBOSE) { + suggested_team_size, this->concurrency, this->a_row_cnt); + if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "\t\tPortableNumericCHASH -- adjusted hashsize:" << thread_shmem_hash_size << " thread_shmem_key_size:" << thread_shmem_key_size @@ -1559,7 +1550,7 @@ void KokkosSPGEMM min_hash_size) { @@ -1568,7 +1559,7 @@ void KokkosSPGEMM min_hash_size) { @@ -1581,14 +1572,15 @@ void KokkosSPGEMMtemplate compute_num_pool_chunks( - chunksize * sizeof(nnz_lno_t), concurrency / suggested_vector_size); + chunksize * sizeof(nnz_lno_t), + this->concurrency / suggested_vector_size); // END SIZE CALCULATIONS FOR MEMORYPOOL - if (KOKKOSKERNELS_VERBOSE) { + if (this->KOKKOSKERNELS_VERBOSE) { std::cout << "\t\t max_nnz: " << max_nnz << " chunk_size:" << chunksize << " min_hash_size:" << min_hash_size - << " concurrency:" << concurrency + << " concurrency:" << this->concurrency << " MyExecSpace::concurrency():" << MyExecSpace::concurrency() << " numchunks:" << num_chunks << std::endl; } @@ -1604,7 +1596,7 @@ void KokkosSPGEMMKOKKOSKERNELS_VERBOSE) { m_space.print_memory_pool(); std::cout << "\t\tPool Alloc Time:" << timer1.seconds() << std::endl; std::cout << "\t\tPool Size(MB):" @@ -1616,18 +1608,17 @@ void KokkosSPGEMM - sc(a_row_cnt, row_mapA, entriesA, valsA, - - row_mapB, entriesB, valsB, + sc(block_dim, this->a_row_cnt, Base::row_mapA, Base::entriesA, + Base::valsA, Base::row_mapB, Base::entriesB, Base::valsB, rowmapC_, entriesC_, valuesC_, shmem_size_to_use, suggested_vector_size, m_space, min_hash_size, max_nnz, suggested_team_size, lcl_my_exec_space, team_row_chunk_size, first_level_cut_off, - flops_per_row, KOKKOSKERNELS_VERBOSE); + flops_per_row, this->KOKKOSKERNELS_VERBOSE); - if (KOKKOSKERNELS_VERBOSE) { + if (this->KOKKOSKERNELS_VERBOSE) { std::cout << "\t\tvector_size:" << suggested_vector_size << " chunk_size:" << team_row_chunk_size << " suggested_team_size:" << suggested_team_size << std::endl; @@ -1637,85 +1628,85 @@ void KokkosSPGEMM()) { if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) { if (thread_shmem_key_size <= 0) { - std::cout << "KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: " + std::cout << "KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: " "Insufficient shmem available for key for hash map " "accumulator - Terminating" << std::endl; std::cout << " thread_shmem_key_size = " << thread_shmem_key_size << std::endl; throw std::runtime_error( - " KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: " + " KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: " "Insufficient shmem available for key for hash map accumulator "); } Kokkos::parallel_for( "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY_SPREADTEAM", - gpu_team_policy4_t(a_row_cnt / team_row_chunk_size + 1, + gpu_team_policy4_t(this->a_row_cnt / team_row_chunk_size + 1, suggested_team_size, suggested_vector_size), sc); MyExecSpace().fence(); } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) { if (thread_shmem_key_size <= 0) { - std::cout << "KokkosSPGEMM_numeric_hash " + std::cout << "KokkosBSPGEMM_numeric_hash " "SPGEMM_KK_MEMORY_BIGSPREADTEAM: Insufficient shmem " "available for key for hash map accumulator - Terminating" << std::endl; std::cout << " thread_shmem_key_size = " << thread_shmem_key_size << std::endl; throw std::runtime_error( - " KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY_BIGSPREADTEAM: " + " KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY_BIGSPREADTEAM: " "Insufficient shmem available for key for hash map accumulator "); } Kokkos::parallel_for( "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY_BIGSPREADTEAM", - gpu_team_policy6_t(a_row_cnt / team_row_chunk_size + 1, + gpu_team_policy6_t(this->a_row_cnt / team_row_chunk_size + 1, suggested_team_size, suggested_vector_size), sc); } else { if (team_shmem_key_size <= 0) { - std::cout - << "KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY: Insufficient shmem " - "available for key for hash map accumulator - Terminating" - << std::endl; + std::cout << "KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY: " + "Insufficient shmem " + "available for key for hash map accumulator - Terminating" + << std::endl; std::cout << " team_shmem_key_size = " << team_shmem_key_size << std::endl; throw std::runtime_error( - " KokkosSPGEMM_numeric_hash SPGEMM_KK_MEMORY: Insufficient shmem " + " KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY: Insufficient shmem " "available for key for hash map accumulator "); } Kokkos::parallel_for( "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY", - gpu_team_policy_t(a_row_cnt / team_row_chunk_size + 1, + gpu_team_policy_t(this->a_row_cnt / team_row_chunk_size + 1, suggested_team_size, suggested_vector_size), sc); } MyExecSpace().fence(); } else { if (algorithm_to_run == SPGEMM_KK_LP) { - if (use_dynamic_schedule) { + if (Base::use_dynamic_schedule) { Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_LP::DYNAMIC", dynamic_multicore_team_policy4_t( - a_row_cnt / team_row_chunk_size + 1, + this->a_row_cnt / team_row_chunk_size + 1, suggested_team_size, suggested_vector_size), sc); } else { Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_LP::STATIC", multicore_team_policy4_t( - a_row_cnt / team_row_chunk_size + 1, + this->a_row_cnt / team_row_chunk_size + 1, suggested_team_size, suggested_vector_size), sc); } } else { - if (use_dynamic_schedule) { + if (Base::use_dynamic_schedule) { Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::KKMEM::DYNAMIC", dynamic_multicore_team_policy_t( - a_row_cnt / team_row_chunk_size + 1, + this->a_row_cnt / team_row_chunk_size + 1, suggested_team_size, suggested_vector_size), sc); } else { Kokkos::parallel_for( "KOKKOSPARSE::SPGEMM::KKMEM::STATIC", - multicore_team_policy_t(a_row_cnt / team_row_chunk_size + 1, + multicore_team_policy_t(this->a_row_cnt / team_row_chunk_size + 1, suggested_team_size, suggested_vector_size), sc); } @@ -1723,11 +1714,12 @@ void KokkosSPGEMMKOKKOSKERNELS_VERBOSE) { std::cout << "\t\tNumeric TIME:" << timer1.seconds() << std::endl; } } +#if 0 // 01/30/2020: this code seems to be unused within any of the kokkos-kernels // spgemm numeric phase algorithms // TODO determine if this code should be revived for use or removed @@ -1850,6 +1842,7 @@ void KokkosSPGEMM -struct KokkosSPGEMM::NumericCMEM_CPU { +struct KokkosBSPGEMM::NumericCMEM_CPU { + using BSPGEMM = KokkosBSPGEMM; + nnz_lno_t numrows; nnz_lno_t numcols; + nnz_lno_t block_dim; + nnz_lno_t block_size; a_row_view_t row_mapA; a_nnz_view_t entriesA; @@ -81,8 +87,9 @@ struct KokkosSPGEMM -struct KokkosSPGEMM::NumericCMEM { + static constexpr auto scalarAlignPad = + KokkosBSPGEMM::NumericCMEM { + b_scalar_nnz_view_t_>::scalarAlignPad; + nnz_lno_t numrows; + nnz_lno_t block_dim; + nnz_lno_t block_size; a_row_view_t__ row_mapA; a_nnz_view_t__ entriesA; @@ -231,8 +249,8 @@ struct KokkosSPGEMM alignof(nnz_lno_t)) - ? (alignof(scalar_t) - alignof(nnz_lno_t)) - : 0; shmem_key_size = ((thread_memory - sizeof(nnz_lno_t) * 2 - scalarAlignPad) / unit_memory); if (KOKKOSKERNELS_VERBOSE_) { @@ -291,10 +308,9 @@ struct KokkosSPGEMM> 1) << 1; if (KOKKOSKERNELS_VERBOSE_) { @@ -334,20 +350,21 @@ struct KokkosSPGEMM(all_shared_memory); - KokkosKernels::Experimental::HashmapAccumulator< + KokkosKernels::Experimental::BlockHashmapAccumulator< nnz_lno_t, nnz_lno_t, scalar_t, KokkosKernels::Experimental::HashOpType::bitwiseAnd> - hm(shmem_key_size, shared_memory_hash_func, begins, nexts, keys, vals); + hm(block_dim, shmem_key_size, shared_memory_hash_func, begins, nexts, + keys, vals); // issue-508, TODO: understand and re-work below parallel_for loop. // Inialize hm2 with correct max_value_size and hashOpRHS // global_memory_hash_size is computed, per team of threads -- this is // hashOpRHS. - KokkosKernels::Experimental::HashmapAccumulator< + KokkosKernels::Experimental::BlockHashmapAccumulator< nnz_lno_t, nnz_lno_t, scalar_t, KokkosKernels::Experimental::HashOpType::modulo> - hm2(0, 0, NULL, NULL, NULL, NULL); + hm2(block_dim, 0, 0, NULL, NULL, NULL, NULL); /* KokkosKernels::Experimental::HashmapAccumulator hm2(global_memory_hash_size, global_memory_hash_size, @@ -363,7 +380,7 @@ struct KokkosSPGEMMspgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == // this->spgemm_algorithm) : -// call KokkosSPGEMM_numeric_speed(...) +// call KokkosBSPGEMM_numeric_speed(...) // else: -// call KokkosSPGEMM_numeric_hash(...) +// call KokkosBSPGEMM_numeric_hash(...) // // -// KokkosSPGEMM_numeric_speed: +// KokkosBSPGEMM_numeric_speed: // // Algorithm selection as follows and matching to kernel Tag: // @@ -489,19 +508,19 @@ template template -void KokkosSPGEMM:: - KokkosSPGEMM_numeric_speed( +void KokkosBSPGEMM:: + KokkosBSPGEMM_numeric_speed( c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, c_scalar_nnz_view_t valuesC_, KokkosKernels::Impl::ExecSpaceType my_exec_space_) { - if (KOKKOSKERNELS_VERBOSE) { + if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "\tSPEED MODE" << std::endl; } - nnz_lno_t brows = row_mapB.extent(0) - 1; - size_type bnnz = valsB.extent(0); + nnz_lno_t brows = this->row_mapB.extent(0) - 1; + size_type bnnz = this->valsB.extent(0); // get suggested vector size, teamsize and row chunk size. int suggested_vector_size = @@ -509,7 +528,7 @@ void KokkosSPGEMMhandle->get_suggested_team_size(suggested_vector_size); nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size( - suggested_team_size, concurrency, a_row_cnt); + suggested_team_size, this->concurrency, Base::a_row_cnt); Kokkos::Timer numeric_speed_timer_with_free; @@ -529,19 +548,19 @@ void KokkosSPGEMM - sc(a_row_cnt, row_mapA, entriesA, valsA, - - row_mapB, entriesB, valsB, + sc(Base::a_row_cnt, block_dim, this->row_mapA, this->entriesA, + this->valsA, this->row_mapB, this->entriesB, this->valsB, rowmapC_, entriesC_, valuesC_, - beginsC, nextsC, shmem_size, suggested_vector_size, - team_row_chunk_size, suggested_team_size, KOKKOSKERNELS_VERBOSE); + beginsC, nextsC, this->shmem_size, suggested_vector_size, + team_row_chunk_size, suggested_team_size, + Base::KOKKOSKERNELS_VERBOSE); Kokkos::Timer timer1; MyExecSpace().fence(); - if (KOKKOSKERNELS_VERBOSE) { + if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "\t\tGPU vector_size:" << suggested_vector_size << " team_size:" << suggested_team_size << " chunk_size:" << team_row_chunk_size << std::endl; @@ -552,12 +571,12 @@ void KokkosSPGEMMconcurrency; Kokkos::Timer timer1; - pool_memory_space m_space( - num_chunks, this->b_col_cnt + (this->b_col_cnt) / sizeof(scalar_t) + 1, - 0, my_pool_type); + const size_t chunk_size = this->b_col_cnt * block_dim * block_dim + + this->b_col_cnt / sizeof(scalar_t) + 1; + pool_memory_space m_space(num_chunks, chunk_size, 0, my_pool_type); MyExecSpace().fence(); - if (KOKKOSKERNELS_VERBOSE) { + if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "\t\tPool Alloc Time:" << timer1.seconds() << std::endl; std::cout << "\tPool Size(MB):" - << sizeof(scalar_t) * - (num_chunks * - (this->b_col_cnt + - (this->b_col_cnt) / sizeof(scalar_t) + 1)) / - 1024. / 1024. + << sizeof(scalar_t) * (num_chunks * chunk_size) / 1024. / 1024. << std::endl; } @@ -591,44 +606,44 @@ void KokkosSPGEMM - sc(a_row_cnt, b_col_cnt, row_mapA, entriesA, valsA, - - row_mapB, entriesB, valsB, + sc(Base::a_row_cnt, this->b_col_cnt, block_dim, this->row_mapA, + this->entriesA, this->valsA, this->row_mapB, this->entriesB, + this->valsB, rowmapC_, entriesC_, valuesC_, m_space, my_exec_space_, team_row_chunk_size); MyExecSpace().fence(); - if (KOKKOSKERNELS_VERBOSE) { + if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "\t\tCPU vector_size:" << suggested_vector_size << " team_size:" << suggested_team_size << " chunk_size:" << team_row_chunk_size << std::endl; } timer1.reset(); - if (use_dynamic_schedule) { + if (this->use_dynamic_schedule) { Kokkos::parallel_for("KokkosSparse::NumericCMEM_CPU::DENSE::DYNAMIC", dynamic_multicore_team_policy_t( - a_row_cnt / team_row_chunk_size + 1, + Base::a_row_cnt / team_row_chunk_size + 1, suggested_team_size, suggested_vector_size), sc); } else { Kokkos::parallel_for( "KokkosSparse::NumericCMEM_CPU::DENSE::STATIC", - multicore_team_policy_t(a_row_cnt / team_row_chunk_size + 1, + multicore_team_policy_t(Base::a_row_cnt / team_row_chunk_size + 1, suggested_team_size, suggested_vector_size), sc); } MyExecSpace().fence(); - if (KOKKOSKERNELS_VERBOSE) { + if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "\t\tNumeric TIME:" << timer1.seconds() << std::endl; std::cout << "\t\tNumeric SPEED TIME:" << numeric_speed_timer.seconds() << std::endl; } } - if (KOKKOSKERNELS_VERBOSE) { + if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "\t\tNumeric SPEED TIME WITH FREE:" << numeric_speed_timer_with_free.seconds() << std::endl; } diff --git a/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp index 658b2a1303..06ac4b5aaa 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp @@ -54,7 +54,7 @@ //#include "KokkosSparse_spgemm_symbolic.hpp" #include "KokkosSparse_spgemm_cuSPARSE_impl.hpp" #include "KokkosSparse_spgemm_CUSP_impl.hpp" -#include "KokkosSparse_spgemm_impl.hpp" +#include "KokkosSparse_bspgemm_impl.hpp" #include "KokkosSparse_bspgemm_impl_seq.hpp" #include "KokkosSparse_spgemm_mkl_impl.hpp" #include "KokkosSparse_spgemm_mkl2phase_impl.hpp" @@ -278,12 +278,12 @@ struct BSPGEMM_NUMERIC< default: { - KokkosSPGEMM - kspgemm(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA, - row_mapB, entriesB, valuesB, transposeB); - kspgemm.KokkosSPGEMM_numeric(row_mapC, entriesC, valuesC); + KokkosBSPGEMM + kbspgemm(handle, m, n, k, blockDim, row_mapA, entriesA, valuesA, + transposeA, row_mapB, entriesB, valuesB, transposeB); + kbspgemm.KokkosBSPGEMM_numeric(row_mapC, entriesC, valuesC); } break; case SPGEMM_SERIAL: case SPGEMM_DEBUG: diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index 09a8bf212a..dd6aa19625 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -282,7 +282,7 @@ class KokkosSPGEMM { typedef Kokkos::TeamPolicy > dynamic_team_policy_t; - private: + protected: HandleType *handle; nnz_lno_t a_row_cnt; nnz_lno_t b_row_cnt; @@ -795,7 +795,7 @@ class KokkosSPGEMM { typename c_scalar_nnz_view_t::const_value_type omega, dinv_view_t dinv, KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space); - private: + protected: template void KokkosSPGEMM_jacobi_denseacc( From dbb1f11950993e0424c8172d18f988c887fcb1ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Mon, 28 Mar 2022 20:48:43 +0200 Subject: [PATCH 096/261] fix kk_vector_block_add_mul() naming --- src/common/KokkosKernels_BlockUtils.hpp | 2 +- .../impl/KokkosSparse_bspgemm_impl_kkmem.hpp | 24 +++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/common/KokkosKernels_BlockUtils.hpp b/src/common/KokkosKernels_BlockUtils.hpp index c6f9f55e3e..30a46f36ec 100644 --- a/src/common/KokkosKernels_BlockUtils.hpp +++ b/src/common/KokkosKernels_BlockUtils.hpp @@ -119,7 +119,7 @@ KOKKOS_INLINE_FUNCTION void kk_block_add_mul(const size_type block_dim, // Performs C += A * B (dense GEMM) on blocks // Note: all pointers reference dense row-major blocks (no extra padding) template -KOKKOS_INLINE_FUNCTION void kk_vector_block_mul_add(const size_type block_dim, +KOKKOS_INLINE_FUNCTION void kk_vector_block_add_mul(const size_type block_dim, value_type *dst, const value_type *valA, const value_type *valB) { diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp index 69d932d6f9..25bcd68e72 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp @@ -810,7 +810,7 @@ struct KokkosBSPGEMM max_first_level_hash_size) @@ -837,7 +837,7 @@ struct KokkosBSPGEMM max_first_level_hash_size) @@ -864,7 +864,7 @@ struct KokkosBSPGEMM Date: Mon, 28 Mar 2022 21:20:32 +0200 Subject: [PATCH 097/261] clean up unused macros --- src/sparse/impl/KokkosSparse_bspgemm_impl.hpp | 9 --------- src/sparse/impl/KokkosSparse_spgemm_impl.hpp | 8 -------- 2 files changed, 17 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp index d015778ca1..fd6d07cf2c 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp @@ -45,15 +45,6 @@ #ifndef _KOKKOSBSPGEMMIMPL_HPP #define _KOKKOSBSPGEMMIMPL_HPP -//#define KOKKOSKERNELS_ANALYZE_COMPRESSION -//#define KOKKOSKERNELS_ANALYZE_MEMORYACCESS -//#define HASHTRACK - -//#define TRACK_INSERTS -//#define GPU_EXPERIMENTAL -//#define NUMERIC_USE_STATICMEM -//#define twostep - #include "KokkosSparse_spgemm_impl.hpp" namespace KokkosSparse { diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index dd6aa19625..9b4c28c877 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -45,14 +45,6 @@ #ifndef _KOKKOSSPGEMMIMPL_HPP #define _KOKKOSSPGEMMIMPL_HPP -//#define KOKKOSKERNELS_ANALYZE_COMPRESSION -//#define KOKKOSKERNELS_ANALYZE_MEMORYACCESS -//#define HASHTRACK - -//#define TRACK_INSERTS -//#define GPU_EXPERIMENTAL -//#define NUMERIC_USE_STATICMEM -//#define twostep #include #include #include From 433f69c946e32afce5781461d253980878ed057d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Mon, 28 Mar 2022 21:47:04 +0200 Subject: [PATCH 098/261] refactor view indexing --- .../impl/KokkosSparse_bspgemm_impl_seq.hpp | 30 +++++++++---------- .../impl/KokkosSparse_spgemm_impl_seq.hpp | 30 +++++++++---------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp index 7862268082..f9575322a8 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp @@ -111,17 +111,17 @@ void spgemm_debug_symbolic(KernelHandle *handle, lno_t row_size = 0; for (lno_t j = 0; j < a_row_size; ++j) { - size_type ind = a_row_begin + j; - lno_t col = h_enta(ind); - // scalar_t val = h_vala(ind); + size_type a_ind = a_row_begin + j; + lno_t col = h_enta(a_ind); + // scalar_t val = h_vala(a_ind); const size_type b_row_begin = h_rmb(col); const size_type b_row_end = h_rmb(col + 1); lno_t b_row_size = b_row_end - b_row_begin; for (lno_t z = 0; z < b_row_size; ++z) { - size_type ind_ = b_row_begin + z; - lno_t b_col = h_entb(ind_); - // scalar_t b_val = h_valb(ind_); + size_type b_ind = b_row_begin + z; + lno_t b_col = h_entb(b_ind); + // scalar_t b_val = h_valb(b_ind); // if (i == 0) std::cout << "\tb col:" << b_col << std::endl; if (acc_flag[b_col] == false) { acc_flag[b_col] = true; @@ -221,16 +221,16 @@ void bspgemm_debug_numeric(KernelHandle* /* handle */, lno_t c_row_size_counter = 0; for (lno_t j = 0; j < a_row_size; ++j) { - size_type ind = a_row_begin + j; - lno_t col = h_enta(ind); - auto a_val = h_vala.data() + ind * block_size; // valuesA(i, col) + size_type a_ind = a_row_begin + j; + lno_t col = h_enta(a_ind); + auto a_val = &h_vala(a_ind * block_size); const size_type b_row_begin = h_rmb(col); const size_type b_row_end = h_rmb(col + 1); lno_t b_row_size = b_row_end - b_row_begin; for (lno_t z = 0; z < b_row_size; ++z) { - size_type ind_ = b_row_begin + z; - lno_t b_col = h_entb(ind_); - auto b_val = h_valb.data() + ind_ * block_size; // valuesB(col, b_col) + size_type b_ind = b_row_begin + z; + lno_t b_col = h_entb(b_ind); + auto b_val = &h_valb(b_ind * block_size); if (acc_flag[b_col] == false) { acc_flag[b_col] = true; @@ -246,10 +246,10 @@ void bspgemm_debug_numeric(KernelHandle* /* handle */, // if (i == 0) std::cout << "result_cols" << std::endl; for (lno_t j = 0; j < c_row_size; ++j) { - size_type ind = c_row_begin + j; - lno_t result_col = h_entc(ind); + size_type c_ind = c_row_begin + j; + lno_t result_col = h_entc(c_ind); auto acc = get_block(accumulator, result_col, block_size); - Kokkos::deep_copy(get_block(h_valc, ind, block_size), acc); + Kokkos::deep_copy(get_block(h_valc, c_ind, block_size), acc); Kokkos::deep_copy(acc, ZERO); Kokkos::fence(); acc_flag[result_col] = false; diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_seq.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_seq.hpp index ce3501c447..32492482fe 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_seq.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_seq.hpp @@ -95,17 +95,17 @@ void spgemm_debug_symbolic(KernelHandle *handle, lno_t row_size = 0; for (lno_t j = 0; j < a_row_size; ++j) { - size_type ind = a_row_begin + j; - lno_t col = h_enta(ind); - // scalar_t val = h_vala(ind); + size_type a_ind = a_row_begin + j; + lno_t col = h_enta(a_ind); + // scalar_t val = h_vala(a_ind); const size_type b_row_begin = h_rmb(col); const size_type b_row_end = h_rmb(col + 1); lno_t b_row_size = b_row_end - b_row_begin; for (lno_t z = 0; z < b_row_size; ++z) { - size_type ind_ = b_row_begin + z; - lno_t b_col = h_entb(ind_); - // scalar_t b_val = h_valb(ind_); + size_type b_ind = b_row_begin + z; + lno_t b_col = h_entb(b_ind); + // scalar_t b_val = h_valb(b_ind); // if (i == 0) std::cout << "\tb col:" << b_col << std::endl; if (acc_flag[b_col] == false) { acc_flag[b_col] = true; @@ -194,16 +194,16 @@ void spgemm_debug_numeric(KernelHandle * /* handle */, lno_t c_row_size_counter = 0; for (lno_t j = 0; j < a_row_size; ++j) { - size_type ind = a_row_begin + j; - lno_t col = h_enta(ind); - scalar_t val = h_vala(ind); + size_type a_ind = a_row_begin + j; + lno_t col = h_enta(a_ind); + scalar_t val = h_vala(a_ind); const size_type b_row_begin = h_rmb(col); const size_type b_row_end = h_rmb(col + 1); lno_t b_row_size = b_row_end - b_row_begin; for (lno_t z = 0; z < b_row_size; ++z) { - size_type ind_ = b_row_begin + z; - lno_t b_col = h_entb(ind_); - scalar_t b_val = h_valb(ind_); + size_type b_ind = b_row_begin + z; + lno_t b_col = h_entb(b_ind); + scalar_t b_val = h_valb(b_ind); if (acc_flag[b_col] == false) { acc_flag[b_col] = true; @@ -216,9 +216,9 @@ void spgemm_debug_numeric(KernelHandle * /* handle */, // if (i == 0) std::cout << "result_cols" << std::endl; for (lno_t j = 0; j < c_row_size; ++j) { - size_type ind = c_row_begin + j; - lno_t result_col = h_entc(ind); - h_valc(ind) = accumulator[result_col]; + size_type c_ind = c_row_begin + j; + lno_t result_col = h_entc(c_ind); + h_valc(c_ind) = accumulator[result_col]; accumulator[result_col] = 0; acc_flag[result_col] = false; } From bd1e495ed621d4b79405f36a0c8d5e09e8c33221 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 29 Mar 2022 01:13:07 +0200 Subject: [PATCH 099/261] fix literal max int --- unit_test/sparse/Test_Sparse_bspgemm.hpp | 2 +- unit_test/sparse/Test_Sparse_spgemm.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/unit_test/sparse/Test_Sparse_bspgemm.hpp b/unit_test/sparse/Test_Sparse_bspgemm.hpp index 4463eba503..fa425b86b8 100644 --- a/unit_test/sparse/Test_Sparse_bspgemm.hpp +++ b/unit_test/sparse/Test_Sparse_bspgemm.hpp @@ -278,7 +278,7 @@ void test_bspgemm(lno_t blockDim, lno_t m, lno_t k, lno_t n, size_type nnz, #endif for (auto spgemm_algorithm : algorithms) { - const uint64_t max_integer = 2147483647; + const uint64_t max_integer = Kokkos::ArithTraits::max(); std::string algo = "UNKNOWN"; bool is_expected_to_fail = false; diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp index 47b06b716a..a7b9432857 100644 --- a/unit_test/sparse/Test_Sparse_spgemm.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm.hpp @@ -287,7 +287,7 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, #endif for (auto spgemm_algorithm : algorithms) { - const uint64_t max_integer = 2147483647; + const uint64_t max_integer = Kokkos::ArithTraits::max(); std::string algo = "UNKNOWN"; bool is_expected_to_fail = false; From dd4b02c21425ac31a096742f3a65b1f22f59ca77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 29 Mar 2022 01:31:05 +0200 Subject: [PATCH 100/261] fix row_size naming --- src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp | 4 ++-- src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp index 54217fef41..507511ef85 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp @@ -165,9 +165,9 @@ struct KokkosBSPGEMM Date: Tue, 29 Mar 2022 13:52:25 +0200 Subject: [PATCH 101/261] clean outdated comment --- src/common/KokkosKernels_BlockHashmapAccumulator.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/common/KokkosKernels_BlockHashmapAccumulator.hpp b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp index 1777189612..ed77b08f3a 100644 --- a/src/common/KokkosKernels_BlockHashmapAccumulator.hpp +++ b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp @@ -543,8 +543,6 @@ struct BlockHashmapAccumulator { } } - // NOTE: this is an exact copy of vector_atmoic_insert_into_hash_mergeAdd from - // https://github.com/kokkos/kokkos-kernels/blob/750fe24508a69ed4dba92bb4a9e17a6094b1a083/src/common/KokkosKernels_HashmapAccumulator.hpp#L442-L502 template KOKKOS_INLINE_FUNCTION int vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( From 3514cb3858dbe3779bf8340eb336cdf61a758d6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 29 Mar 2022 13:53:01 +0200 Subject: [PATCH 102/261] refactor unused return value --- src/common/KokkosKernels_BlockHashmapAccumulator.hpp | 7 +++---- src/common/KokkosKernels_HashmapAccumulator.hpp | 7 +++---- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/common/KokkosKernels_BlockHashmapAccumulator.hpp b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp index ed77b08f3a..69d3fd13bc 100644 --- a/src/common/KokkosKernels_BlockHashmapAccumulator.hpp +++ b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp @@ -357,13 +357,13 @@ struct BlockHashmapAccumulator { // Insertion is sequential, no race condition for the insertion. // the mergeadd used in the numeric of KKMEM. KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_mergeAdd_TrackHashes( + void sequential_insert_into_hash_mergeAdd_TrackHashes( key_type key, const value_type *valueA, const value_type *valueB, size_type *used_size_, size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_index; - if (key == -1) return __insert_success; + if (key == -1) return; // issue-508, TODO: ensure that i < __max_value_size, but // need information about length of keys, values, and hash_nexts first! @@ -372,7 +372,7 @@ struct BlockHashmapAccumulator { if (keys[i] == key) { KokkosSparse::Impl::kk_block_add_mul(block_dim, values + i * block_size, valueA, valueB); - return __insert_success; + return; } } @@ -387,7 +387,6 @@ struct BlockHashmapAccumulator { keys[my_index] = key; KokkosSparse::Impl::kk_block_set_mul( block_dim, values + my_index * block_size, valueA, valueB); - return __insert_success; } // Performs C[hash] += A * B (for existing entry) diff --git a/src/common/KokkosKernels_HashmapAccumulator.hpp b/src/common/KokkosKernels_HashmapAccumulator.hpp index b7f39f75c2..c6397fd9ea 100644 --- a/src/common/KokkosKernels_HashmapAccumulator.hpp +++ b/src/common/KokkosKernels_HashmapAccumulator.hpp @@ -344,12 +344,12 @@ struct HashmapAccumulator { // Insertion is sequential, no race condition for the insertion. // the mergeadd used in the numeric of KKMEM. KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_mergeAdd_TrackHashes( + void sequential_insert_into_hash_mergeAdd_TrackHashes( key_type key, value_type value, size_type *used_size_, size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_index; - if (key == -1) return __insert_success; + if (key == -1) return; // issue-508, TODO: ensure that i < __max_value_size, but // need information about length of keys, values, and hash_nexts first! @@ -357,7 +357,7 @@ struct HashmapAccumulator { for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { if (keys[i] == key) { values[i] = values[i] + value; - return __insert_success; + return; } } @@ -371,7 +371,6 @@ struct HashmapAccumulator { hash_begins[hash] = my_index; keys[my_index] = key; values[my_index] = value; - return __insert_success; } // no values. simply adds to the keys. From 682a175b483e8c95c64ed3e5e678a605b95c1259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 29 Mar 2022 13:56:58 +0200 Subject: [PATCH 103/261] clean up unused code --- .../impl/KokkosSparse_bspgemm_numeric_spec.hpp | 17 ----------------- .../impl/KokkosSparse_spgemm_numeric_spec.hpp | 17 ----------------- 2 files changed, 34 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp index 06ac4b5aaa..075080a45b 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp @@ -220,23 +220,6 @@ struct BSPGEMM_NUMERIC< if (!sh->is_symbolic_called()) { throw std::runtime_error( "Call spgemm symbolic before calling SpGEMM numeric"); - /* - KokkosSparse::Experimental::spgemm_symbolic( - handle, m, n, k, - row_mapA, entriesA, transposeA, - row_mapB, entriesB, transposeB, - row_mapC - ); - typename c_size_view_t_::value_type c_nnz_size = - handle->get_spgemm_handle()->get_c_nnz(); if (c_nnz_size){ entriesC = - c_lno_view_t (Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), - c_nnz_size); valuesC = c_scalar_view_t - (Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size); - } - */ } switch (sh->get_algorithm_type()) { diff --git a/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp index 0b28d2f02b..24008d3b26 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp @@ -211,23 +211,6 @@ struct SPGEMM_NUMERIC< if (!sh->is_symbolic_called()) { throw std::runtime_error( "Call spgemm symbolic before calling SpGEMM numeric"); - /* - KokkosSparse::Experimental::spgemm_symbolic( - handle, m, n, k, - row_mapA, entriesA, transposeA, - row_mapB, entriesB, transposeB, - row_mapC - ); - typename c_size_view_t_::value_type c_nnz_size = - handle->get_spgemm_handle()->get_c_nnz(); if (c_nnz_size){ entriesC = - c_lno_view_t (Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), - c_nnz_size); valuesC = c_scalar_view_t - (Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size); - } - */ } switch (sh->get_algorithm_type()) { From e380d1b6985f3ad4af7b4c46078b451633626960 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Sat, 5 Mar 2022 15:21:00 +0100 Subject: [PATCH 104/261] Clean SpGEMM code not used in block version --- .../KokkosKernels_BlockHashmapAccumulator.hpp | 228 +-------- src/sparse/impl/KokkosSparse_bspgemm_impl.hpp | 444 ------------------ .../impl/KokkosSparse_bspgemm_impl_def.hpp | 214 --------- .../impl/KokkosSparse_bspgemm_impl_kkmem.hpp | 208 -------- .../impl/KokkosSparse_bspgemm_impl_seq.hpp | 84 ---- unit_test/sparse/Test_Sparse_bspgemm.hpp | 132 ------ 6 files changed, 1 insertion(+), 1309 deletions(-) diff --git a/src/common/KokkosKernels_BlockHashmapAccumulator.hpp b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp index 69d3fd13bc..576060cf75 100644 --- a/src/common/KokkosKernels_BlockHashmapAccumulator.hpp +++ b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp @@ -46,6 +46,7 @@ #include #include #include "KokkosKernels_BlockUtils.hpp" +#include "KokkosKernels_HashmapAccumulator.hpp" //#define HASHMAPACCUMULATOR_ASSERT_ENABLED @@ -53,21 +54,6 @@ namespace KokkosKernels { namespace Experimental { -#if 0 // defined in HashmapAccumulator header - include if needed or drop -/** - * @brief types of hash operations supported by HashmapAccumulator. - * - * /var bitwiseAnd: Performs key & hashOpRHS - * /var modulo: Performs key % hashOpRHS - * /var pow2Modulo: Performs key & (hashOpRHS - 1) - */ -struct HashOpType { - struct bitwiseAnd {}; - struct modulo {}; - struct pow2Modulo {}; -}; -#endif - template /** @@ -173,185 +159,6 @@ struct BlockHashmapAccumulator { } } -#if 0 // not used in block SPGEMM - // function to be called from device. - // Accumulation is OR operation. - // Insertion is sequential, no race condition for the insertion. - KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_mergeOr_TrackHashes(key_type key, - value_type value, - size_type *used_size_, - size_type *used_hash_size, - size_type *used_hashes) { - size_type hash, i, my_index; - - if (key == -1) return __insert_success; - - hash = __compute_hash(key, __hashOpRHS); - for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { - if (keys[i] == key) { - values[i] = values[i] | value; - return __insert_success; - } - } - - if (*used_size_ >= __max_value_size) return __insert_full; - my_index = (*used_size_)++; - - if (hash_begins[hash] == -1) { - used_hashes[used_hash_size[0]++] = hash; - } - hash_nexts[my_index] = hash_begins[hash]; - - hash_begins[hash] = my_index; - keys[my_index] = key; - values[my_index] = value; - return __insert_success; - } - - // function to be called from device. - // Accumulation is OR operation. - // TODO: This function is for triangle counting. - // Assume that there are 2 values for triangle count. - KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_mergeOr_TriangleCount_TrackHashes( - key_type key, value_type value, value_type *values2, - size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { - size_type hash, i, my_index; - - if (key == -1) return __insert_success; - - hash = __compute_hash(key, __hashOpRHS); - for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { - if (keys[i] == key) { - values2[i] = values2[i] | (values[i] & value); - values[i] = values[i] | value; - return __insert_success; - } - } - - if (*used_size_ >= __max_value_size) return __insert_full; - my_index = (*used_size_)++; - - if (hash_begins[hash] == -1) { - used_hashes[used_hash_size[0]++] = hash; - } - hash_nexts[my_index] = hash_begins[hash]; - - hash_begins[hash] = my_index; - keys[my_index] = key; - values[my_index] = value; - values2[my_index] = 0; - return __insert_success; - } - - // this is used in slow triangle counting method. - // L x Incidence - KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_mergeAnd_TriangleCount_TrackHashes( - key_type key, value_type value, value_type *values2, - size_type * /*used_size_*/, size_type * /*used_hash_size*/, - size_type * /*used_hashes*/) { - size_type hash, i; - - if (key == -1) return __insert_success; - - // this function will only try to do an AND operation with - // existing keys. If the key is not there, returns __insert_full. - hash = __compute_hash(key, __hashOpRHS); - for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { - if (keys[i] == key) { - // values2[i] = values2[i] | (values[i] & value); - values[i] = values[i] & value; - ++values2[i]; - return __insert_success; - } - } - return __insert_full; - } - - // this is used in LxL or Incidence^T x L - KOKKOS_INLINE_FUNCTION - value_type sequential_insert_into_hash_mergeAnd_TriangleCount_TrackHashes( - key_type key, value_type value) { - size_type hash, i; - - if (key == -1) return __insert_success; - - // this function will only try to do an AND operation with - // existing keys. If the key is not there, returns __insert_full. - hash = __compute_hash(key, __hashOpRHS); - for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { - if (keys[i] == key) { - return values[i] & value; - } - } - return 0; - } - - // this is used in slow triangle counting method. - // L x Incidence - KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_TriangleCount_TrackHashes( - key_type key, value_type value, value_type *values2, - size_type *used_size_, size_type *used_hash_size, - size_type *used_hashes) { - size_type hash, my_index; - - if (key == -1) return __insert_success; - - // this function will directly insert, won't check if it exists already. - if (*used_size_ >= __max_value_size) return __insert_full; - my_index = (*used_size_)++; - - keys[my_index] = key; - values[my_index] = value; - values2[my_index] = 1; - - hash = __compute_hash(key, __hashOpRHS); - if (hash_begins[hash] == -1) { - hash_begins[hash] = my_index; - used_hashes[used_hash_size[0]++] = hash; - } else { - hash_nexts[my_index] = hash_begins[hash]; - hash_begins[hash] = my_index; - } - return __insert_success; - } - - // this is used in LxL or Incidence^T x L - KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_TriangleCount_TrackHashes( - key_type key, value_type value, size_type *used_size_, - size_type *used_hash_size, - size_type *used_hashes) // issue-508, TODO figure out what this - // "used_hashes" is for - { - size_type hash, my_index; - - if (key == -1) return __insert_success; - - // this function will directly insert, won't check if it exists already. - if (*used_size_ >= __max_value_size) return __insert_full; - my_index = (*used_size_)++; - - keys[my_index] = key; - values[my_index] = value; - - hash = __compute_hash(key, __hashOpRHS); - if (hash_begins[hash] == -1) { - hash_begins[hash] = my_index; - used_hashes[used_hash_size[0]++] = hash; - } else { - hash_nexts[my_index] = hash_begins[hash]; - hash_begins[hash] = my_index; - } - return __insert_success; - } - -#endif - // Performs C[hash] += A * B (for existing entry) // or C[hash] = A * B (for new entry) // Insertion is sequential, no race condition for the insertion. @@ -431,39 +238,6 @@ struct BlockHashmapAccumulator { } } -#if 0 - // no values. simply adds to the keys. - // used in the compression to count the sets. - // also used in the symbolic of spgemm if no compression is applied. - KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_TrackHashes(key_type key, - size_type *used_size_, - size_type *used_hash_size, - size_type *used_hashes) { - size_type hash, i, my_index; - - if (key == -1) return __insert_success; - - hash = __compute_hash(key, __hashOpRHS); - for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { - if (keys[i] == key) { - return __insert_success; - } - } - - my_index = (*used_size_)++; - - if (hash_begins[hash] == -1) { - used_hashes[used_hash_size[0]++] = hash; - } - hash_nexts[my_index] = hash_begins[hash]; - - hash_begins[hash] = my_index; - keys[my_index] = key; - return __insert_success; - } -#endif - // used in the kkmem's numeric phase for second level hashmaps. // function to be called from device. // Accumulation is Add operation. It is not atomicAdd, as this diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp index fd6d07cf2c..7b003229ab 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp @@ -94,118 +94,6 @@ class KokkosBSPGEMM USE_BASE_TYPE(multicore_team_policy_t) USE_BASE_TYPE(multicore_team_policy4_t) -#if 0 // defined in base class (clean up or implement block version) - private: - HandleType *handle; - nnz_lno_t a_row_cnt; - nnz_lno_t b_row_cnt; - nnz_lno_t b_col_cnt; - - const_a_lno_row_view_t row_mapA; - const_a_lno_nnz_view_t entriesA; - const_a_scalar_nnz_view_t valsA; - bool transposeA; - - const_b_lno_row_view_t row_mapB; - const_b_lno_nnz_view_t entriesB; - const_b_scalar_nnz_view_t valsB; - bool transposeB; - - const size_t shmem_size; - size_t concurrency; - const bool use_dynamic_schedule; - const bool KOKKOSKERNELS_VERBOSE; - // const int KOKKOSKERNELS_VERBOSE = 1; - - const KokkosKernels::Impl::ExecSpaceType MyEnumExecSpace; - const SPGEMMAlgorithm spgemm_algorithm; - const SPGEMMAccumulator spgemm_accumulator; - - ////////////////////////////////////////////////////////////////////////////// - //////Function and Struct for matrix compression. - //////Declerations are at KokkosKernels_SPGEMM_impl_compression.hpp - ////////////////////////////////////////////////////////////////////////////// - - /** - * \brief Given a symbolic matrix (a graph), it compresses the graph using - * bits. \param in_row_map: input row pointers. \param in_entries: input - * column entries \param out_row_map: output row pointers of the compressed - * matrix \param out_nnz_indices: output, column set indices of the output - * matrix. \param out_nnz_sets: output, column sets of the output matrix. - * - */ - template - bool compressMatrix(nnz_lno_t n, size_type nnz, in_row_view_t in_row_map, - in_nnz_view_t in_entries, out_rowmap_view_t out_row_map, - out_nnz_view_t &out_nnz_indices, - out_nnz_view_t &out_nnz_sets, bool singleStep); - - public: - /** - *\brief Functor to zip the B matrix. - */ - template - struct SingleStepZipMatrix; - - private: - ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - ////BELOW code is for triangle count specific. - ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - template - void triangle_count_ai(const int is_symbolic_or_numeric, const nnz_lno_t m, - const size_type *row_mapA_, const nnz_lno_t *entriesA_, - - const size_type bnnz, const size_type *old_row_mapB, - const size_type *row_mapB_, - const nnz_lno_t *entriesSetIndex, - const nnz_lno_t *entriesSets, - - size_type *rowmapC, nnz_lno_t *entriesC, - struct_visit_t visit_applier); - - public: - template - struct TriangleCount; - - template - void KokkosSPGEMM_numeric_triangle(c_row_view_t rowmapC_, - c_lno_nnz_view_t entriesC_, - c_scalar_nnz_view_t valuesC_); - - template - void KokkosSPGEMM_symbolic_triangle(c_row_view_t rowmapC_); - template - void KokkosSPGEMM_generic_triangle(visit_struct_t visit_apply); - - /* - template - void KokkosSPGEMM_generic_triangle_no_compression(visit_struct_t visit_apply); - - template - void triangle_count_ai_no_compression( - const nnz_lno_t m, - const size_type* row_mapA_, - const nnz_lno_t * entriesA_, - - const size_type bnnz, - const size_type * rowmapB_begins, - const size_type * rowmapB_ends, - const nnz_lno_t * entriesB, - struct_visit_t visit_applier); - */ - void KokkosSPGEMM_symbolic_triangle_setup(); - - private: - template - void KokkosSPGEMM_numeric_triangle_ai(c_row_view_t rowmapC_, - c_lno_nnz_view_t entriesC_); -#endif - public: ////////////////////////////////////////////////////////////////////////// /////BELOW CODE IS TO for SPEED SPGEMM @@ -236,44 +124,6 @@ class KokkosBSPGEMM c_scalar_nnz_view_t valuesC_, KokkosKernels::Impl::ExecSpaceType my_exec_space); -#if 0 - public: - /* - ////////////////////////////////////////////////////////////////////////// - /////BELOW CODE IS TO for colored SPGEMM - ////DECL IS AT _color.hpp - ////////////////////////////////////////////////////////////////////////// - template struct NumericCCOLOR; - */ - private: - /** - * \brief Numeric phase with speed method - */ - /* - template void KokkosSPGEMM_numeric_color( c_row_view_t rowmapC_, - c_lno_nnz_view_t entriesC_, - c_scalar_nnz_view_t valuesC_, - SPGEMMAlgorithm spgemm_algorithm); - - template - void d2_color_c_matrix( - c_row_view_t rowmapC, - c_nnz_view_t entryIndicesC_, - - nnz_lno_t &original_num_colors, - nnz_lno_persistent_work_host_view_t &h_color_xadj, - nnz_lno_persistent_work_view_t &color_adj, - nnz_lno_persistent_work_view_t &vertex_colors_to_store, - - nnz_lno_t &num_colors_in_one_step, - nnz_lno_t &num_multi_color_steps, - SPGEMMAlgorithm spgemm_algorithm); - */ -#endif private: // How many extra bytes are needed to align a scalar_t after an array of // nnz_lno_t, in the worst case? Incurred once per hashmap, which may be per @@ -308,116 +158,6 @@ class KokkosBSPGEMM c_scalar_nnz_view_t valuesC_, KokkosKernels::Impl::ExecSpaceType my_exec_space); -#if 0 // defined in base class (clean up or implement block version) - template - void KokkosSPGEMM_numeric_hash( - c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, - c_scalar_nnz_view_t valuesC_, - KokkosKernels::Impl::ExecSpaceType my_exec_space); -#if defined(KOKKOS_ENABLE_OPENMP) -#ifdef KOKKOSKERNELS_HAVE_OUTER - public: - // OUTER PRODUCT CODES - struct Triplet; - - template - struct OuterProduct; - - template - struct FlopsPerRowOuter; - - private: - template - void sort_triplets(triplet_view_t triplets, size_t num_triplets); - - template - void merge_triplets_on_slow_memory(host_triplet_view_t *triplets, - size_t num_blocks, size_t overall_size, - host_triplet_view_t output_triplets); - - template - size_t final_collapse_triplets_omp(triplet_view_t triplets, - size_t num_triplets, - c_row_view_t &rowmapC_, - c_lno_nnz_view_t &entriesC_, - c_scalar_nnz_view_t &valuesC_); - - template - size_t collapse_triplets(triplet_view_t triplets, size_t num_triplets); - - template - size_t collapse_triplets_omp(triplet_view_t triplets, size_t num_triplets, - triplet_view_t out_triplets); - -#endif -#endif - - template - void KokkosSPGEMM_numeric_outer( - c_row_view_t &rowmapC_, c_lno_nnz_view_t &entriesC_, - c_scalar_nnz_view_t &valuesC_, - KokkosKernels::Impl::ExecSpaceType my_exec_space); - ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - -#ifdef KOKKOSKERNELS_ANALYZE_MEMORYACCESS - ////////////////////////////////////////////////////////////////////////// - /////BELOW CODE IS TO CALCULATE MEMORY ACCESSES WITH HYPERGRAPH MODEL///// - ////DECL IS AT _memaccess.hpp - ////////////////////////////////////////////////////////////////////////// - public: - // Functor to calculate how many flops is performed per row of C. - template - struct FlopsPerRow; - struct Cache; - - private: - void create_read_write_hg(size_t &overall_flops, - row_lno_temp_work_view_t &c_flop_rowmap, - row_lno_temp_work_view_t &c_comp_a_net_index, - row_lno_temp_work_view_t &c_comp_b_net_index, - nnz_lno_temp_work_view_t &c_comp_row_index, - nnz_lno_temp_work_view_t &c_comp_col_index); - - template - void print_read_write_cost(c_row_view_t rowmapC); - - template - void read_write_cost( - nnz_lno_t num_colors, nnz_lno_t num_multi_colors, - nnz_lno_t num_parallel_colors, bool isGPU, int num_cores, - - nnz_lno_t num_hyperthreads_in_core, nnz_lno_t hyper_threads_in_team, - - int vectorlane, const int cache_line_size, const int data_size, - const int cache_size, - - nnz_lno_persistent_work_host_view_t color_xadj, - typename nnz_lno_persistent_work_view_t::HostMirror color_adj, - typename nnz_lno_persistent_work_view_t::HostMirror vertex_colors, - - size_t overall_flops, - typename row_lno_temp_work_view_t::HostMirror c_flop_rowmap, - typename row_lno_temp_work_view_t::HostMirror c_comp_a_net_index, - typename row_lno_temp_work_view_t::HostMirror c_comp_b_net_index, - typename nnz_lno_temp_work_view_t::HostMirror c_comp_row_index, - typename nnz_lno_temp_work_view_t::HostMirror c_comp_col_index, - c_row_view_t rowmapC, - int write_type // 0 -- KKMEM, 1-KKSPEED, 2- KKCOLOR 3-KKMULTICOLOR - // 4-KKMULTICOLOR2 - ); - -#endif -#endif - public: ////////////////////////////////////////////////////////////////////////// /////BELOW CODE IS for public symbolic and numeric functions @@ -428,25 +168,6 @@ class KokkosBSPGEMM void KokkosBSPGEMM_numeric(c_row_view_t &rowmapC_, c_lno_nnz_view_t &entriesC_, c_scalar_nnz_view_t &valuesC_); - // TODO: These are references only for outer product algorithm. - // If the algorithm is removed, then remove the references. - -#if 0 - /** - * \brief Symbolic phase of the SPGEMM. - * \param rowmapC_: row pointers for the result matrix. Allocated before the - * call with size (n+1), where n is the number of rows of first matrix. - */ - template - void KokkosSPGEMM_symbolic(c_row_view_t rowmapC_); - - template - void write_matrix_to_plot(nnz_lno_t &num_colors, - nnz_lno_persistent_work_host_view_t &h_color_xadj, - nnz_lno_persistent_work_view_t &color_adj, - c_row_view_t &rowmapC, - c_nnz_view_t &entryIndicesC_); -#endif KokkosBSPGEMM(HandleType *handle_, nnz_lno_t m_, nnz_lno_t n_, nnz_lno_t k_, nnz_lno_t block_dim_, const_a_lno_row_view_t row_mapA_, @@ -467,171 +188,6 @@ class KokkosBSPGEMM : Base(handle_, m_, n_, k_, row_mapA_, entriesA_, valsA_, transposeA_, row_mapB_, entriesB_, valsB_, transposeB_), block_dim(block_dim_) {} - -#if 0 // defined in base class (clean up or implement block version) - ////////////////////////////////////////////////////////////////////////// - /////BELOW CODE IS for symbolic phase - ////DECL IS AT _symbolic.hpp - ////////////////////////////////////////////////////////////////////////// - public: - /*** - * \brief Functor to calculate the row sizes of C. - */ - template - struct StructureC; - - template - struct StructureC_NC; - - template - struct NonzeroesC; - - /** - * \brief Functor to calculate the max flops in a row of SPGEMM. - * - */ - template - struct PredicMaxRowNNZ; - - struct PredicMaxRowNNZIntersection; - struct PredicMaxRowNNZ_p; - - private: - /** - * \brief function return max flops for a row in the result multiplication. - * \param m: number of rows in A - * \param row_mapA: row pointers of A. - * \param entriesA: column indices of A - * \param row_pointers_begin_B: beginning of the row indices for B - * \param row_pointers_end_B: end of the row indices for B - */ - template - size_t getMaxRoughRowNNZ(nnz_lno_t m, a_row_view_t row_mapA_, - a_nnz_view_t entriesA_, - - b_oldrow_view_t row_pointers_begin_B, - b_row_view_t row_pointers_end_B, - size_type *flops_per_row = NULL); - - size_t getMaxRoughRowNNZ_p(const nnz_lno_t m, const size_type annz, - const size_type *row_mapA_, - const nnz_lno_t *entriesA_, - - const size_type *row_pointers_begin_B, - const size_type *row_pointers_end_B); - - size_t getMaxRoughRowNNZIntersection_p( - const nnz_lno_t m, const size_type annz, const size_type *row_mapA_, - const nnz_lno_t *entriesA_, - - const size_type *row_pointers_begin_B, - const size_type *row_pointers_end_B, - nnz_lno_t *min_result_row_for_each_row); - - template - void symbolic_c(nnz_lno_t m, a_r_view_t row_mapA_, a_nnz_view_t entriesA_, - - b_original_row_view_t old_row_mapB, - b_compressed_row_view_t row_mapB_, - b_nnz_view_t entriesSetIndex, b_nnz_view_t entriesSets, - - c_row_view_t rowmapC, nnz_lno_t maxNumRoughNonzeros); - - template - void symbolic_c_no_compression(nnz_lno_t m, a_r_view_t row_mapA_, - a_nnz_view_t entriesA_, - - b_original_row_view_t b_rowmap_begin, - b_compressed_row_view_t b_rowmap_end, - b_nnz_view_t entriesb_, c_row_view_t rowmapC, - nnz_lno_t maxNumRoughNonzeros); - - ////////////////////////////////////////////////////////////////////////// - ///// Jacobi-fused SpGEMM declarations - ////////////////////////////////////////////////////////////////////////// - public: - template < - typename a_row_view_t, typename a_nnz_view_t, typename a_scalar_view_t, - typename b_row_view_t, typename b_nnz_view_t, typename b_scalar_view_t, - typename c_row_view_t, typename c_nnz_view_t, typename c_scalar_view_t, - typename dinv_view_t, typename pool_memory_type> - struct JacobiSpGEMMSparseAcc; - - template - struct JacobiSpGEMMDenseAcc; - - template - void KokkosSPGEMM_jacobi_sparseacc( - c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, - c_scalar_nnz_view_t valuesC_, - typename c_scalar_nnz_view_t::const_value_type omega, dinv_view_t dinv, - KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space); - - private: - template - void KokkosSPGEMM_jacobi_denseacc( - c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, - c_scalar_nnz_view_t valuesC_, - typename c_scalar_nnz_view_t::const_value_type omega, dinv_view_t dinv, - KokkosKernels::Impl::ExecSpaceType my_exec_space); - - // Utility to compute the number of pool chunks for L2 hashmap accumulators. - // Uses free memory query for accelerators/GPUs but assumes infinite available - // host memory. - // - // chunk_bytes: bytes in each chunk - // ideal_num_chunks: number of chunks that would give each thread/team its own - // chunk (no contention) - template - size_t compute_num_pool_chunks(size_t chunk_bytes, size_t ideal_num_chunks) { - if (!KokkosKernels::Impl::kk_is_gpu_exec_space< - typename Pool::execution_space>()) - return ideal_num_chunks; - size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory( - free_byte, total_byte); - size_t required_size = ideal_num_chunks * chunk_bytes; - if (KOKKOSKERNELS_VERBOSE) - std::cout << "\tmempool required size:" << required_size - << " free_byte:" << free_byte << " total_byte:" << total_byte - << std::endl; - size_t num_chunks = ideal_num_chunks; - // If there is not enough memory to safely allocate ideal_num_chunks, use - // half the free memory, rounded down - if (required_size > free_byte / 2) { - num_chunks = (free_byte / 2) / chunk_bytes; - } - // then take the largest power of 2 smaller than that - size_t po2_num_chunks = 1; - while (po2_num_chunks * 2 < num_chunks) { - po2_num_chunks *= 2; - } - return po2_num_chunks; - } -#endif }; } // namespace Impl diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp index c4ecbd6503..36729f39ca 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp @@ -77,219 +77,5 @@ void KokkosBSPGEMM -template -void KokkosSPGEMM::KokkosSPGEMM_symbolic(c_row_view_t - rowmapC_) { - { - if (KOKKOSKERNELS_VERBOSE) { - std::cout << "SYMBOLIC PHASE" << std::endl; - } - // first calculate the number of original flops required. - { - nnz_lno_t maxNumRoughZeros = 0; - size_t overall_flops = 0; - Kokkos::Timer timer1; - auto new_row_mapB_begin = - Kokkos::subview(row_mapB, std::make_pair(nnz_lno_t(0), b_row_cnt)); - auto new_row_mapB_end = Kokkos::subview( - row_mapB, std::make_pair(nnz_lno_t(1), b_row_cnt + 1)); - row_lno_persistent_work_view_t flops_per_row( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "original row flops"), - a_row_cnt); - - // get maximum row flops. - maxNumRoughZeros = this->getMaxRoughRowNNZ( - a_row_cnt, row_mapA, entriesA, new_row_mapB_begin, new_row_mapB_end, - flops_per_row.data()); - - // calculate overal flops. - KokkosKernels::Impl::kk_reduce_view2( - a_row_cnt, flops_per_row, overall_flops); - if (KOKKOSKERNELS_VERBOSE) { - std::cout << "\tOriginal Max Row Flops:" << maxNumRoughZeros - << std::endl; - std::cout << "\tOriginal overall_flops Flops:" << overall_flops - << std::endl; - std::cout << "\ttOriginal Max Row Flop Calc Time:" << timer1.seconds() - << std::endl; - } - this->handle->get_spgemm_handle()->original_max_row_flops = - maxNumRoughZeros; - this->handle->get_spgemm_handle()->original_overall_flops = overall_flops; - this->handle->get_spgemm_handle()->row_flops = flops_per_row; - } - - // number of rows and nnzs - nnz_lno_t n = this->row_mapB.extent(0) - 1; - size_type nnz = this->entriesB.extent(0); - - bool compress_in_single_step = - this->handle->get_spgemm_handle()->get_compression_step(); - // compress in single step if it is GPU. - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) - compress_in_single_step = true; - - // compressed B fields. - row_lno_temp_work_view_t new_row_mapB( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "new row map"), n + 1); - row_lno_temp_work_view_t new_row_mapB_begins; - - nnz_lno_temp_work_view_t - set_index_entries; // will be output of compress matrix. - nnz_lno_temp_work_view_t set_entries; // will be output of compress matrix - - // First Compress B. - Kokkos::Timer timer1; - - if (KOKKOSKERNELS_VERBOSE) { - std::cout << "\tCOMPRESS MATRIX-B PHASE" << std::endl; - } - - // call compression. - // it might not go through to the end if ratio is not high. - bool compression_applied = this->compressMatrix( - n, nnz, this->row_mapB, this->entriesB, new_row_mapB, set_index_entries, - set_entries, compress_in_single_step); - - if (KOKKOSKERNELS_VERBOSE) { - std::cout << "\t\tCOMPRESS MATRIX-B overall time:" << timer1.seconds() - << std::endl - << std::endl; - } - - timer1.reset(); - - // first get the max flops for a row, which will be used for max row size. - // If we did compression in single step, row_mapB[i] points the begining of - // row i, and new_row_mapB[i] points to the end of row i. - - if (compression_applied) { - nnz_lno_t maxNumRoughZeros = - this->handle->get_spgemm_handle()->compressed_max_row_flops; - - if (compress_in_single_step) { - // calling symbolic structure - this->symbolic_c(a_row_cnt, row_mapA, entriesA, row_mapB, new_row_mapB, - set_index_entries, set_entries, rowmapC_, - maxNumRoughZeros); - - } else { - nnz_lno_t begin = 0; - auto new_row_mapB_begin = - Kokkos::subview(new_row_mapB, std::make_pair(begin, n)); - auto new_row_mapB_end = - Kokkos::subview(new_row_mapB, std::make_pair(begin + 1, n + 1)); - - // calling symbolic structure - this->symbolic_c(a_row_cnt, row_mapA, entriesA, new_row_mapB_begin, - new_row_mapB_end, set_index_entries, set_entries, - rowmapC_, maxNumRoughZeros); - } - } else { - new_row_mapB = row_lno_temp_work_view_t(); - new_row_mapB_begins = row_lno_temp_work_view_t(); - set_index_entries = nnz_lno_temp_work_view_t(); - set_entries = nnz_lno_temp_work_view_t(); - nnz_lno_t maxNumRoughZeros = - this->handle->get_spgemm_handle()->original_max_row_flops; - if (KOKKOSKERNELS_VERBOSE) { - std::cout << "SYMBOLIC PHASE -- NO COMPRESSION: maxNumRoughZeros:" - << maxNumRoughZeros << std::endl; - } - - auto new_row_mapB_begin = - Kokkos::subview(this->row_mapB, std::make_pair(nnz_lno_t(0), n)); - auto new_row_mapB_end = - Kokkos::subview(this->row_mapB, std::make_pair(nnz_lno_t(1), n + 1)); - - // calling symbolic structure - this->symbolic_c_no_compression( - a_row_cnt, row_mapA, entriesA, new_row_mapB_begin, new_row_mapB_end, - this->entriesB, rowmapC_, maxNumRoughZeros); - } -#ifdef KOKKOSKERNELS_ANALYZE_MEMORYACCESS - double read_write_cost = - this->handle->get_spgemm_handle()->get_read_write_cost_calc(); - if (read_write_cost) { - this->print_read_write_cost(rowmapC_); - } -#endif - } -} - -template -template -void KokkosSPGEMM:: - write_matrix_to_plot(nnz_lno_t &num_colors, - nnz_lno_persistent_work_host_view_t &h_color_xadj, - nnz_lno_persistent_work_view_t &color_adj, - c_row_view_t &rowmapC, c_nnz_view_t &entryIndicesC_) { - std::cout << "writing to plot" << std::endl; - - nnz_lno_persistent_work_host_view_t h_color_adj = - Kokkos::create_mirror_view(color_adj); - Kokkos::deep_copy(h_color_adj, color_adj); - auto h_rowmapC = Kokkos::create_mirror_view(rowmapC); - Kokkos::deep_copy(h_rowmapC, rowmapC); - auto h_entryIndicesC = Kokkos::create_mirror_view(entryIndicesC_); - Kokkos::deep_copy(h_entryIndicesC, entryIndicesC_); - - for (nnz_lno_t i = 0; i < num_colors; ++i) { - nnz_lno_t color_begin = h_color_xadj(i); - nnz_lno_t color_end = h_color_xadj(i + 1); - - std::string colorind = ""; - std::stringstream ss; - ss << i; - - ss >> colorind; - colorind += ".coords"; - std::fstream fs; - fs.open(colorind.c_str(), std::fstream::out); - - std::cout << "COLOR:" << i << " colorbegin:" << color_begin - << " colorend:" << color_end - << " size:" << color_end - color_begin << std::endl; - for (nnz_lno_t j = color_begin; j < color_end; ++j) { - nnz_lno_t row = h_color_adj(j); - for (size_type k = h_rowmapC(row); k < h_rowmapC(row + 1); ++k) { - nnz_lno_t column = h_entryIndicesC(k); - // std::cout << row << " " << column << std::endl; - fs << row << " " << column << std::endl; - } - } - fs.close(); - } - - std::fstream fs; - fs.open("plot1.gnuplot", std::fstream::out); - for (nnz_lno_t i = 0; i < num_colors; ++i) { - std::string colorind = "\""; - std::stringstream ss; - ss << i; - - ss >> colorind; - colorind += ".coords\""; - if (i > 0) fs << "re"; - fs << "plot " << colorind << std::endl; - } - fs << "pause -1" << std::endl; - fs.close(); -} -#endif - } // namespace Impl } // namespace KokkosSparse diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp index 25bcd68e72..a30bbfd170 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp @@ -400,89 +400,6 @@ struct KokkosBSPGEMM - hm2(chunk_size, pow2_hash_func, NULL, NULL, NULL, NULL); - - tmp += pow2_hash_size; - - hm2.hash_begins = (nnz_lno_t *)(tmp); - tmp += pow2_hash_size; - hm2.hash_nexts = (nnz_lno_t *)(tmp); - tmp += max_nnz; - - hm2.keys = (nnz_lno_t *)(tmp); - tmp += max_nnz; - hm2.values = - KokkosKernels::Impl::alignPtr(tmp); - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), - [&](const nnz_lno_t &row_index) { - nnz_lno_t globally_used_hash_count = 0; - nnz_lno_t used_hash_sizes = 0; - - const size_type c_row_begin = rowmapC[row_index]; - const size_type c_row_end = rowmapC[row_index + 1]; - - const nnz_lno_t global_memory_hash_size = - nnz_lno_t(c_row_end - c_row_begin); - - const size_type col_begin = row_mapA[row_index]; - const nnz_lno_t left_work = row_mapA[row_index + 1] - col_begin; - for (nnz_lno_t ii = 0; ii < left_work; ++ii) { - size_type a_col = col_begin + ii; - nnz_lno_t rowB = entriesA[a_col]; - scalar_t valA = valuesA[a_col]; - - size_type rowBegin = row_mapB(rowB); - nnz_lno_t left_workB = row_mapB(rowB + 1) - rowBegin; - - for (nnz_lno_t i = 0; i < left_workB; ++i) { - const size_type adjind = i + rowBegin; - nnz_lno_t b_col_ind = entriesB[adjind]; - scalar_t b_val = valuesB[adjind] * valA; - nnz_lno_t hash = b_col_ind & pow2_hash_func; - - // this has to be a success, we do not need to check for the - // success. int insertion = - hm2.sequential_insert_into_hash_mergeAdd_TrackHashes( - b_col_ind, b_val, &used_hash_sizes, &globally_used_hash_count, - globally_used_hash_indices); - } - } - for (nnz_lno_t i = 0; i < globally_used_hash_count; ++i) { - nnz_lno_t dirty_hash = globally_used_hash_indices[i]; - hm2.hash_begins[dirty_hash] = -1; - } - for (nnz_lno_t i = 0; i < global_memory_hash_size; ++i) { - pEntriesC[c_row_begin + i] = hm2.keys[i]; - pvaluesC[c_row_begin + i] = hm2.values[i]; - } - }); - memory_space.release_chunk(globally_used_hash_indices); - } -#endif - KOKKOS_INLINE_FUNCTION void operator()(const GPUTag &, const team_member_t &teamMember) const { nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size; @@ -1719,130 +1636,5 @@ void KokkosBSPGEMM -template -void KokkosSPGEMM:: - KokkosSPGEMM_numeric_hash2( - c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, - c_scalar_nnz_view_t valuesC_, - KokkosKernels::Impl::ExecSpaceType my_exec_space_) { - if (KOKKOSKERNELS_VERBOSE) { - std::cout << "\tHASH MODE" << std::endl; - } - - nnz_lno_t brows = row_mapB.extent(0) - 1; - size_type bnnz = valsB.extent(0); - - int suggested_vector_size = - this->handle->get_suggested_vector_size(brows, bnnz); - int suggested_team_size = - this->handle->get_suggested_team_size(suggested_vector_size); - nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size( - suggested_team_size, concurrency, a_row_cnt); - - typedef KokkosKernels::Impl::UniformMemoryPool - pool_memory_space; - - nnz_lno_t max_nnz = this->handle->get_spgemm_handle()->get_max_result_nnz(); - nnz_lno_t min_hash_size = 1; - while (max_nnz > min_hash_size) { - min_hash_size *= 4; - } - - size_t chunksize = min_hash_size; // this is for used hash indices - chunksize += min_hash_size; // this is for the hash begins - chunksize += max_nnz; // this is for hash nexts - chunksize += max_nnz; // this is for indices - chunksize += - max_nnz * (sizeof(scalar_t) / sizeof(nnz_lno_t)); // this is for values - int num_chunks = concurrency / suggested_vector_size; - - if (KOKKOSKERNELS_VERBOSE) { - std::cout << "\t\t max_nnz: " << max_nnz << " chunk_size:" << chunksize - << " numchunks:" << num_chunks << std::endl; - } - - KokkosKernels::Impl::PoolType my_pool_type = - KokkosKernels::Impl::OneThread2OneChunk; - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { - my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; - } - - Kokkos::Timer timer1; - pool_memory_space m_space(num_chunks, chunksize, -1, my_pool_type); - MyExecSpace().fence(); - - if (KOKKOSKERNELS_VERBOSE) { - std::cout << "\t\tPool Alloc Time:" << timer1.seconds() << std::endl; - std::cout << "\t\tPool Size(MB):" - << sizeof(nnz_lno_t) * (num_chunks * chunksize) / 1024. / 1024. - << std::endl; - } - double first_level_cut_off = - this->handle->get_spgemm_handle()->get_first_level_hash_cut_off(); - - PortableNumericCHASH< - const_a_lno_row_view_t, const_a_lno_nnz_view_t, const_a_scalar_nnz_view_t, - const_b_lno_row_view_t, const_b_lno_nnz_view_t, const_b_scalar_nnz_view_t, - c_row_view_t, c_lno_nnz_view_t, c_scalar_nnz_view_t, pool_memory_space> - sc(a_row_cnt, row_mapA, entriesA, valsA, - - row_mapB, entriesB, valsB, - - rowmapC_, entriesC_, valuesC_, shmem_size, suggested_vector_size, - m_space, min_hash_size, max_nnz, suggested_team_size, - - my_exec_space_, team_row_chunk_size, first_level_cut_off, - this->handle->get_spgemm_handle()->row_flops, KOKKOSKERNELS_VERBOSE); - - if (KOKKOSKERNELS_VERBOSE) { - std::cout << "\t\tvector_size:" << suggested_vector_size - << " chunk_size:" << team_row_chunk_size << std::endl; - } - timer1.reset(); - - if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { - Kokkos::parallel_for( - "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY2", - gpu_team_policy_t(a_row_cnt / team_row_chunk_size + 1, - suggested_team_size, suggested_vector_size), - sc); - MyExecSpace().fence(); - } else { - if (use_dynamic_schedule) { - Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY_DYNAMIC", - dynamic_multicore_team_policy2_t( - a_row_cnt / team_row_chunk_size + 1, - suggested_team_size, suggested_vector_size), - sc); - } else { - Kokkos::parallel_for( - "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY_STATIC", - multicore_team_policy2_t(a_row_cnt / team_row_chunk_size + 1, - suggested_team_size, suggested_vector_size), - sc); - } - MyExecSpace().fence(); - } - - if (KOKKOSKERNELS_VERBOSE) { - std::cout << "\t\tNumeric TIME:" << timer1.seconds() << std::endl; - } -} -#endif - } // namespace Impl } // namespace KokkosSparse diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp index f9575322a8..312ba22f8a 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp @@ -63,90 +63,6 @@ KOKKOS_INLINE_FUNCTION kk_subview1d get_block( return Kokkos::subview(data, Kokkos::make_pair(i, i + block_size)); } -#if 0 // not used in block version -template -void spgemm_debug_symbolic(KernelHandle *handle, - typename KernelHandle::nnz_lno_t m, - typename KernelHandle::nnz_lno_t /* n */, - typename KernelHandle::nnz_lno_t k, - alno_row_view_t_ row_mapA, alno_nnz_view_t_ entriesA, - - bool /* transposeA */, blno_row_view_t_ row_mapB, - blno_nnz_view_t_ entriesB, bool /* transposeB */, - clno_row_view_t_ row_mapC) { - typename alno_row_view_t_::HostMirror h_rma = - Kokkos::create_mirror_view(row_mapA); - Kokkos::deep_copy(h_rma, row_mapA); - typename alno_nnz_view_t_::HostMirror h_enta = - Kokkos::create_mirror_view(entriesA); - Kokkos::deep_copy(h_enta, entriesA); - - typename blno_row_view_t_::HostMirror h_rmb = - Kokkos::create_mirror_view(row_mapB); - Kokkos::deep_copy(h_rmb, row_mapB); - typename blno_nnz_view_t_::HostMirror h_entb = - Kokkos::create_mirror_view(entriesB); - Kokkos::deep_copy(h_entb, entriesB); - typename clno_row_view_t_::HostMirror h_rmc = - Kokkos::create_mirror_view(row_mapC); - Kokkos::fence(); - - typedef typename KernelHandle::nnz_lno_t lno_t; - typedef typename KernelHandle::size_type size_type; - // typedef typename KernelHandle::nnz_scalar_t scalar_t; - - std::vector acc_flag(k, false); - - std::vector result_c_col_indices(k); - - size_type result_index = 0; - - h_rmc(0) = 0; - for (lno_t i = 0; i < m; ++i) { - const size_type a_row_begin = h_rma(i); - const size_type a_row_end = h_rma(i + 1); - lno_t a_row_size = a_row_end - a_row_begin; - lno_t row_size = 0; - - for (lno_t j = 0; j < a_row_size; ++j) { - size_type a_ind = a_row_begin + j; - lno_t col = h_enta(a_ind); - // scalar_t val = h_vala(a_ind); - - const size_type b_row_begin = h_rmb(col); - const size_type b_row_end = h_rmb(col + 1); - lno_t b_row_size = b_row_end - b_row_begin; - for (lno_t z = 0; z < b_row_size; ++z) { - size_type b_ind = b_row_begin + z; - lno_t b_col = h_entb(b_ind); - // scalar_t b_val = h_valb(b_ind); - // if (i == 0) std::cout << "\tb col:" << b_col << std::endl; - if (acc_flag[b_col] == false) { - acc_flag[b_col] = true; - result_c_col_indices[row_size++] = b_col; - } - } - } - result_index += row_size; - h_rmc(i + 1) = result_index; - // size_type c_row_begin = h_rmc(i); - - // if (i == 0) std::cout << "result_cols" << std::endl; - - for (lno_t j = 0; j < row_size; ++j) { - lno_t result_col = result_c_col_indices[j]; - acc_flag[result_col] = false; - } - } - - handle->get_spgemm_handle()->set_c_nnz(result_index); - Kokkos::deep_copy(row_mapC, h_rmc); - Kokkos::fence(); -} -#endif - template -int run_spgemm_old_interface(crsMat_t input_mat, crsMat_t input_mat2, - KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, - crsMat_t &result) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type lno_view_t; - typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - - typedef typename lno_view_t::value_type size_type; - typedef typename lno_nnz_view_t::value_type lno_t; - typedef typename scalar_view_t::value_type scalar_t; - - typedef KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space> - KernelHandle; - - KernelHandle kh; - kh.set_team_work_size(16); - kh.set_dynamic_scheduling(true); - // kh.set_verbose(true); - - kh.create_spgemm_handle(spgemm_algorithm); - - const size_t num_rows_1 = input_mat.numRows(); - const size_t num_rows_2 = input_mat2.numRows(); - const size_t num_cols_2 = input_mat2.numCols(); - - const size_t num_cols_1 = input_mat.numCols(); - bool equal = num_rows_2 == num_cols_1; - if (!equal) return 1; - - lno_view_t row_mapC("non_const_lnow_row", num_rows_1 + 1); - lno_nnz_view_t entriesC; - scalar_view_t valuesC; - - spgemm_symbolic(&kh, num_rows_1, num_rows_2, num_cols_2, - input_mat.graph.row_map, input_mat.graph.entries, false, - input_mat2.graph.row_map, input_mat2.graph.entries, false, - row_mapC); - - size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz(); - entriesC = lno_nnz_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), c_nnz_size); - valuesC = scalar_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size); - spgemm_numeric(&kh, num_rows_1, num_rows_2, num_cols_2, - input_mat.graph.row_map, input_mat.graph.entries, - input_mat.values, false, - - input_mat2.graph.row_map, input_mat2.graph.entries, - input_mat2.values, false, row_mapC, entriesC, valuesC); - - graph_t static_graph(entriesC, row_mapC); - result = crsMat_t("CrsMatrix", num_cols_2, valuesC, static_graph); - kh.destroy_spgemm_handle(); - - return 0; -} -#endif - template bool is_same_block_matrix(bsrMat_t output_mat_actual, bsrMat_t output_mat_reference) { @@ -352,75 +289,6 @@ void test_bspgemm(lno_t blockDim, lno_t m, lno_t k, lno_t n, size_type nnz, // device::execution_space::finalize(); } -#if 0 // TODO: specific SpGEMM case, not applicable in block version -template -void test_issue402() { - using namespace Test; - typedef CrsMatrix crsMat_t; - - // this specific matrix (from a circuit simulation) reliably replicated issue - // #402 (incorrect/crashing SPGEMM KKMEM) - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type lno_view_t; - typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - const lno_t numRows = 1813; - const size_type nnz = 11156; - lno_view_t Arowmap("A rowmap", numRows + 1); - lno_nnz_view_t Aentries("A entries", nnz); - scalar_view_t Avalues("A values", nnz); - // Read out the matrix from the header file "matrixIssue402.hpp" - { - auto rowmapHost = Kokkos::create_mirror_view(Arowmap); - auto entriesHost = Kokkos::create_mirror_view(Aentries); - auto valuesHost = Kokkos::create_mirror_view(Avalues); - for (lno_t i = 0; i < numRows + 1; i++) - rowmapHost(i) = MatrixIssue402::rowmap[i]; - for (size_type i = 0; i < nnz; i++) { - entriesHost(i) = MatrixIssue402::entries[i]; - valuesHost(i) = MatrixIssue402::values[i]; - } - Kokkos::deep_copy(Arowmap, rowmapHost); - Kokkos::deep_copy(Aentries, entriesHost); - Kokkos::deep_copy(Avalues, valuesHost); - } - crsMat_t A("A", numRows, numRows, nnz, Avalues, Arowmap, Aentries); - // compute explicit transpose: the bug was replicated by computing AA' - lno_view_t Browmap("B = A^T rowmap", numRows + 1); - lno_nnz_view_t Bentries("B = A^T entries", nnz); - scalar_view_t Bvalues("B = A^T values", nnz); - KokkosKernels::Impl::transpose_matrix< - lno_view_t, lno_nnz_view_t, scalar_view_t, lno_view_t, lno_nnz_view_t, - scalar_view_t, lno_view_t, typename device::execution_space>( - numRows, numRows, Arowmap, Aentries, Avalues, Browmap, Bentries, Bvalues); - crsMat_t B("B=A^T", numRows, numRows, nnz, Bvalues, Browmap, Bentries); - crsMat_t Cgold; - run_spgemm(A, B, SPGEMM_DEBUG, Cgold); - crsMat_t C; - bool success = true; - std::string errMsg; - try { - int res = run_spgemm(A, B, SPGEMM_KK_MEMORY, C); - if (res) throw "run_spgemm returned error code"; - } catch (const char *message) { - errMsg = message; - success = false; - } catch (std::string message) { - errMsg = message; - success = false; - } catch (std::exception &e) { - errMsg = e.what(); - success = false; - } - EXPECT_TRUE(success) << "KKMEM still has issue 402 bug! Error message:\n" - << errMsg << '\n'; - bool correctResult = is_same_matrix(C, Cgold); - EXPECT_TRUE(correctResult) - << "KKMEM still has issue 402 bug; C=AA' is incorrect!\n"; -} -#endif - // Note: Tests with shared memory specified aim to trigger specific GPU functors // dispatched by matrix size and the available shared memory. #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ From dcc1600afda2a552b91c741e6cdc4cb1146756f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 29 Mar 2022 14:53:18 +0200 Subject: [PATCH 105/261] Disable unavailable TPL implementations --- .../KokkosSparse_bspgemm_numeric_spec.hpp | 38 +++++-------------- unit_test/sparse/Test_Sparse_bspgemm.hpp | 16 ++++---- 2 files changed, 18 insertions(+), 36 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp index 075080a45b..d87c49bd55 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp @@ -224,39 +224,19 @@ struct BSPGEMM_NUMERIC< switch (sh->get_algorithm_type()) { case SPGEMM_CUSPARSE: - cuSPARSE_apply( - sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB, - entriesB, valuesB, transposeB, row_mapC, entriesC, valuesC); - break; + throw std::runtime_error( + "cuSPARSE implementation for block SpGEMM is not available"); case SPGEMM_CUSP: - CUSP_apply(sh, m, n, k, row_mapA, entriesA, valuesA, - transposeA, row_mapB, entriesB, valuesB, - transposeB, row_mapC, entriesC, valuesC); - break; + throw std::runtime_error( + "CUSP implementation for block SpGEMM is not available"); case SPGEMM_MKL: -#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - mkl_numeric(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, - row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC, - valuesC, handle->get_verbose()); -#else - throw std::runtime_error("MKL was not enabled in this build!"); -#endif - break; case SPGEMM_MKL2PHASE: - mkl2phase_apply(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, - row_mapB, entriesB, valuesB, transposeB, row_mapC, - entriesC, valuesC, handle->get_verbose()); - break; - + throw std::runtime_error( + "MKL implementation available for block SpGEMM is not available"); case SPGEMM_VIENNA: - viennaCL_apply( - sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB, - entriesB, valuesB, transposeB, row_mapC, entriesC, valuesC, - handle->get_verbose()); - break; + throw std::runtime_error( + "Vienna implementation available for block SpGEMM is not " + "available"); default: diff --git a/unit_test/sparse/Test_Sparse_bspgemm.hpp b/unit_test/sparse/Test_Sparse_bspgemm.hpp index 0e4471757d..4d4ee10157 100644 --- a/unit_test/sparse/Test_Sparse_bspgemm.hpp +++ b/unit_test/sparse/Test_Sparse_bspgemm.hpp @@ -199,8 +199,10 @@ void test_bspgemm(lno_t blockDim, lno_t m, lno_t k, lno_t n, size_type nnz, shared_memory_size); std::vector algorithms = { - SPGEMM_KK, SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */, - SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */ + SPGEMM_KK, + SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */, + SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */, + SPGEMM_MKL /* verify failure in case of missing build */, }; if (!KokkosKernels::Impl::kk_is_gpu_exec_space< @@ -210,10 +212,6 @@ void test_bspgemm(lno_t blockDim, lno_t m, lno_t k, lno_t n, size_type nnz, algorithms.push_back(SPGEMM_KK_LP); } -#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - algorithms.push_back(SPGEMM_MKL); -#endif - for (auto spgemm_algorithm : algorithms) { const uint64_t max_integer = Kokkos::ArithTraits::max(); std::string algo = "UNKNOWN"; @@ -228,11 +226,15 @@ void test_bspgemm(lno_t blockDim, lno_t m, lno_t k, lno_t n, size_type nnz, #endif break; - case SPGEMM_MKL: algo = "SPGEMM_MKL"; + case SPGEMM_MKL: + algo = "SPGEMM_MKL"; + is_expected_to_fail = !is_empy_case; // TODO: add block MKL impl #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL if (!KokkosSparse::Impl::mkl_is_supported_value_type::value) { is_expected_to_fail = true; } +#else + is_expected_to_fail = true; // fail: MKL not enabled in build #endif // MKL requires local ordinals to be int. // Note: empty-array special case will NOT fail on this. From b712a5692396e02a4350438dfc6dbd4cf949f878 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Wed, 30 Mar 2022 01:13:03 +0200 Subject: [PATCH 106/261] add some explanation to CPU functor in speed method --- src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp index 507511ef85..372e5d10dd 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp @@ -156,6 +156,11 @@ struct KokkosBSPGEMM Date: Fri, 22 Apr 2022 16:09:00 -0600 Subject: [PATCH 107/261] Use atomic_add and process rows in chunks at each level --- src/sparse/KokkosSparse_spiluk_handle.hpp | 47 ++++- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 96 ++++++---- .../KokkosSparse_spiluk_symbolic_impl.hpp | 171 +++++++++++++++--- 3 files changed, 253 insertions(+), 61 deletions(-) diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp index 522e0461d5..2b58a2aa72 100644 --- a/src/sparse/KokkosSparse_spiluk_handle.hpp +++ b/src/sparse/KokkosSparse_spiluk_handle.hpp @@ -100,12 +100,17 @@ class SPILUKHandle { nnz_lno_view_t level_idx; // the list of rows in each level nnz_lno_view_t level_ptr; // the starting index (into the view level_idx) of each level + nnz_lno_view_t level_nchunks; //number of chunks of rows at each level + nnz_lno_view_t + level_nrowsperchunk; //maximum number of rows among chunks at each level size_type nrows; - size_type nlevel; + size_type nlevels; size_type nnzL; size_type nnzU; - size_type level_maxrows; // maximum number of rows of levels + size_type level_maxrows; // max. number of rows among levels + size_type + level_maxrowsperchunk;//max.number of rows among chunks among levels bool symbolic_complete; @@ -121,11 +126,14 @@ class SPILUKHandle { : level_list(), level_idx(), level_ptr(), + level_nchunks(), + level_nrowsperchunk(), nrows(nrows_), - nlevel(0), + nlevels(0), nnzL(nnzL_), nnzU(nnzU_), level_maxrows(0), + level_maxrowsperchunk(0), symbolic_complete(symbolic_complete_), algm(choice), team_size(-1), @@ -138,9 +146,12 @@ class SPILUKHandle { set_nnzL(nnzL_); set_nnzU(nnzU_); set_level_maxrows(0); + set_level_maxrowsperchunk(0); level_list = nnz_row_view_t("level_list", nrows_), level_idx = nnz_lno_view_t("level_idx", nrows_), level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), + level_nchunks = nnz_lno_view_t(), + level_nrowsperchunk = nnz_lno_view_t(), reset_symbolic_complete(); } @@ -159,6 +170,22 @@ class SPILUKHandle { KOKKOS_INLINE_FUNCTION nnz_lno_view_t get_level_ptr() const { return level_ptr; } + KOKKOS_INLINE_FUNCTION + nnz_lno_view_t get_level_nchunks() const { return level_nchunks; } + + void alloc_level_nchunks(const size_type nlevels_) { + level_nchunks = nnz_lno_view_t("level_nchunks", nlevels_); + } + + KOKKOS_INLINE_FUNCTION + nnz_lno_view_t get_level_nrowsperchunk() const { + return level_nrowsperchunk; + } + + void alloc_level_nrowsperchunk(const size_type nlevels_) { + level_nrowsperchunk = nnz_lno_view_t("level_nrowsperchunk", nlevels_); + } + KOKKOS_INLINE_FUNCTION size_type get_nrows() const { return nrows; } @@ -185,10 +212,18 @@ class SPILUKHandle { this->level_maxrows = level_maxrows_; } + KOKKOS_INLINE_FUNCTION + size_type get_level_maxrowsperchunk() const { return level_maxrowsperchunk; } + + KOKKOS_INLINE_FUNCTION + void set_level_maxrowsperchunk(const size_type level_maxrowsperchunk_) { + this->level_maxrowsperchunk = level_maxrowsperchunk_; + } + bool is_symbolic_complete() const { return symbolic_complete; } - size_type get_num_levels() const { return nlevel; } - void set_num_levels(size_type nlevels_) { this->nlevel = nlevels_; } + size_type get_num_levels() const { return nlevels; } + void set_num_levels(size_type nlevels_) { this->nlevels = nlevels_; } void set_symbolic_complete() { this->symbolic_complete = true; } void reset_symbolic_complete() { this->symbolic_complete = false; } @@ -202,11 +237,9 @@ class SPILUKHandle { void print_algorithm() { if (algm == SPILUKAlgorithm::SEQLVLSCHD_RP) std::cout << "SEQLVLSCHD_RP" << std::endl; - ; if (algm == SPILUKAlgorithm::SEQLVLSCHD_TP1) std::cout << "SEQLVLSCHD_TP1" << std::endl; - ; /* if ( algm == SPILUKAlgorithm::SEQLVLSCHED_TP2 ) { diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 6a1300d747..5ce550653b 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -323,9 +323,9 @@ struct ILUKLvlSchedTP1NumericFunctor { if (ipos != -1) { auto lxu = -U_values(kk) * fact; if (col < rowid) - L_values(ipos) += lxu; + Kokkos::atomic_add (&L_values(ipos), lxu); else - U_values(ipos) += lxu; + Kokkos::atomic_add (&U_values(ipos), lxu); } }); // end for kk @@ -383,28 +383,46 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, using size_type = typename IlukHandle::size_type; using nnz_lno_t = typename IlukHandle::nnz_lno_t; using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; + using WorkViewType = + Kokkos::View>; + using LevelHostViewType = Kokkos::View; size_type nlevels = thandle.get_num_levels(); size_type nrows = thandle.get_nrows(); - // Keep this as host View, create device version and copy to back to host + // Keep these as host View, create device version and copy back to host HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); + HandleDeviceEntriesType level_idx = thandle.get_level_idx(); + HandleDeviceEntriesType level_nchunks = thandle.get_level_nchunks(); + HandleDeviceEntriesType level_nrowsperchunk = + thandle.get_level_nrowsperchunk(); + // Make level_ptr_h a separate allocation, since it will be accessed on host // between kernel launches. If a mirror were used and level_ptr is in UVM // space, a fence would be required before each access since UVM views can // share pages. - Kokkos::View level_ptr_h( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level pointers"), - level_ptr.extent(0)); - Kokkos::deep_copy(level_ptr_h, level_ptr); - - HandleDeviceEntriesType level_idx = thandle.get_level_idx(); + LevelHostViewType level_ptr_h, level_nchunks_h, level_nrowsperchunk_h; + WorkViewType iw; - using WorkViewType = - Kokkos::View>; + level_ptr_h = LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level pointers"), + level_ptr.extent(0)); + Kokkos::deep_copy(level_ptr_h, level_ptr); - WorkViewType iw("iw", thandle.get_level_maxrows(), nrows); - Kokkos::deep_copy(iw, nnz_lno_t(-1)); + if ( thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1 ) { + level_nchunks_h = LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level nchunks"), + level_nchunks.extent(0)); + level_nrowsperchunk_h = LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level nrowsperchunk"), + level_nrowsperchunk.extent(0)); + Kokkos::deep_copy(level_nchunks_h, level_nchunks); + Kokkos::deep_copy(level_nrowsperchunk_h, level_nrowsperchunk); + iw = WorkViewType( Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), thandle.get_level_maxrowsperchunk(), nrows ); + Kokkos::deep_copy(iw, nnz_lno_t(-1)); + } + else { + iw = WorkViewType( Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), thandle.get_level_maxrows(), nrows ); + Kokkos::deep_copy(iw, nnz_lno_t(-1)); + } // Main loop must be performed sequential. Question: Try out Cuda's graph // stuff to reduce kernel launch overhead @@ -424,25 +442,41 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t>( A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, level_idx, iw, lev_start)); - } else if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + } else if ( thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1 ) { using policy_type = Kokkos::TeamPolicy; - int team_size = thandle.get_team_size(); - - ILUKLvlSchedTP1NumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, - LValuesType, URowMapType, UEntriesType, UValuesType, - HandleDeviceEntriesType, WorkViewType, nnz_lno_t> - tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, level_idx, iw, lev_start); - if (team_size == -1) - Kokkos::parallel_for("parfor_l_team", - policy_type(lev_end - lev_start, Kokkos::AUTO), - tstf); - else - Kokkos::parallel_for("parfor_l_team", - policy_type(lev_end - lev_start, team_size), - tstf); + int team_size = thandle.get_team_size(); + + nnz_lno_t lvl_rowid_start = 0; + nnz_lno_t lvl_nrows_chunk; + for(int chunkid=0; chunkid + (lev_end - lev_start)) + lvl_nrows_chunk = (lev_end - lev_start)-lvl_rowid_start; + else + lvl_nrows_chunk = level_nrowsperchunk_h(lvl); + + ILUKLvlSchedTP1NumericFunctor< + ARowMapType, AEntriesType, AValuesType, + LRowMapType, LEntriesType, LValuesType, + URowMapType, UEntriesType, UValuesType, + HandleDeviceEntriesType, WorkViewType, nnz_lno_t> + tstf(A_row_map, A_entries, A_values, + L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values, + level_idx, iw, lev_start+lvl_rowid_start); + + if ( team_size == -1 ) + Kokkos::parallel_for("parfor_l_team", + policy_type( lvl_nrows_chunk , Kokkos::AUTO ), + tstf); + else + Kokkos::parallel_for("parfor_l_team", + policy_type( lvl_nrows_chunk , team_size ), + tstf); + + lvl_rowid_start += lvl_nrows_chunk; + } } // /* // // TP2 algorithm has issues with some offset-ordinal combo to be diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index ff464951c7..672ba1f8fe 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -63,12 +63,14 @@ namespace Experimental { template void level_sched(IlukHandle& thandle, const RowMapType row_map, - const EntriesType entries, const size_type nrows, - LevelType1& level_list, LevelType2& level_ptr, - LevelType2& level_idx, size_type& nlevels) { + const EntriesType entries, LevelType1& level_list, + LevelType2& level_ptr, LevelType2& level_idx, + size_type& nlevels) { // Scheduling currently compute on host - typedef typename IlukHandle::nnz_lno_t nnz_lno_t; + using nnz_lno_t = typename IlukHandle::nnz_lno_t; + + size_type nrows = thandle.get_nrows(); nlevels = 0; level_ptr(0) = 0; @@ -117,6 +119,111 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, thandle.set_level_maxrows(maxrows); } +//SEQLVLSCHD_TP1 algorithm (chunks) +template +void level_sched ( IlukHandle& thandle, const RowMapType row_map, + const EntriesType entries, LevelType1& level_list, + LevelType2& level_ptr, LevelType2& level_idx, + LevelType3& level_nchunks, LevelType3& level_nrowsperchunk, + size_type &nlevels ) { + // Scheduling currently compute on host + + using nnz_lno_t = typename IlukHandle::nnz_lno_t; + using memory_space = typename IlukHandle::memory_space; + + size_type nrows = thandle.get_nrows(); + + nlevels = 0; + level_ptr(0) = 0; + + for ( size_type i = 0; i < nrows; ++i ) { + size_type l = 0; + size_type rowstart= row_map(i); + size_type rowend = row_map(i+1); + for ( size_type j = rowstart; j < rowend; ++j ) { + nnz_lno_t col = entries(j); + l = std::max(l, level_list(col)); + } + level_list(i) = l+1; + level_ptr(l+1) += 1; + nlevels = std::max(nlevels, l+1); + } + + for ( size_type i = 1; i <= nlevels; ++i ) { + level_ptr(i) += level_ptr(i-1); + } + + for ( size_type i = 0; i < nrows; i++ ) { + level_idx(level_ptr(level_list(i)-1)) = i; + level_ptr(level_list(i)-1) += 1; + } + + if (nlevels>0) {// note: to avoid wrapping around to the max of size_t + // when nlevels = 0. + for ( size_type i = nlevels-1; i > 0; --i ) { + level_ptr(i) = level_ptr(i-1); + } + } + + level_ptr(0) = 0; + + // Find max rows, number of chunks, max rows of chunks across levels + using HostViewType = Kokkos::View; + + HostViewType lnchunks( "lnchunks", nlevels ); + HostViewType lnrowsperchunk( "lnrowsperchunk", nlevels ); + + size_t avail_byte = 0; +#ifdef KOKKOS_ENABLE_CUDA + if ( std::is_same< memory_space, Kokkos::CudaSpace >::value ) { + size_t free_byte, total_byte; + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); + avail_byte = static_cast(0.85*free_byte); + } +#endif + + size_type maxrows = 0; + size_type maxrowsperchunk = 0; + for ( size_type i = 0; i < nlevels; ++i ) { + size_type lnrows = level_ptr(i+1) - level_ptr(i); + if( maxrows < lnrows ) { + maxrows = lnrows; + } +#ifdef KOKKOS_ENABLE_CUDA + size_t required_size = static_cast(lnrows)*nrows*sizeof(nnz_lno_t); + if ( std::is_same< memory_space, Kokkos::CudaSpace >::value ) + { + lnchunks(i) = required_size/avail_byte+1; + lnrowsperchunk(i) = (lnrows%lnchunks(i)==0)?(lnrows/lnchunks(i)): + (lnrows/lnchunks(i)+1); + } + else +#endif + { + lnchunks(i) = 1; + lnrowsperchunk(i) = lnrows; + } + if( maxrowsperchunk < lnrowsperchunk(i) ) { + maxrowsperchunk = lnrowsperchunk(i); + } + } + + thandle.set_num_levels(nlevels); + thandle.set_level_maxrows(maxrows); + thandle.set_level_maxrowsperchunk(maxrowsperchunk); + + level_nchunks = lnchunks; + level_nrowsperchunk = lnrowsperchunk; + +} + // Linear Search for the smallest row index template size_type search_col_index(nnz_lno_t j, size_type lenl, ViewType h_iL, @@ -166,11 +273,11 @@ void iluk_symbolic(IlukHandle& thandle, // Scheduling and symbolic phase currently compute on host - need host copy // of all views - typedef typename IlukHandle::size_type size_type; - typedef typename IlukHandle::nnz_lno_t nnz_lno_t; + using size_type = typename IlukHandle::size_type; + using nnz_lno_t = typename IlukHandle::nnz_lno_t; - typedef typename IlukHandle::nnz_lno_view_t HandleDeviceEntriesType; - typedef typename IlukHandle::nnz_row_view_t HandleDeviceRowMapType; + using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; + using HandleDeviceRowMapType = typename IlukHandle::nnz_row_view_t; // typedef typename IlukHandle::signed_integral_t signed_integral_t; @@ -217,13 +324,14 @@ void iluk_symbolic(IlukHandle& thandle, // Can only resize managed views Kokkos::resize(L_entries_d, // L_entries_d.extent(0)-3); thandle.set_nnzL(L_entries_d.extent(0)+5); - typedef Kokkos::View - HostTmpViewType; + using HostTmpViewType = + Kokkos::View; HostTmpViewType h_lev("h_lev", thandle.get_nnzU()); HostTmpViewType h_iw("h_iw", nrows); HostTmpViewType h_iL("h_iL", nrows); HostTmpViewType h_llev("h_llev", nrows); + HostTmpViewType level_nchunks, level_nrowsperchunk; size_type cntL = 0; size_type cntU = 0; @@ -367,8 +475,32 @@ void iluk_symbolic(IlukHandle& thandle, } // Level scheduling on L - level_sched(thandle, L_row_map, L_entries, nrows, level_list, level_ptr, - level_idx, nlev); + if ( thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1 ) { + level_sched (thandle, L_row_map, L_entries, level_list, level_ptr, + level_idx, level_nchunks, level_nrowsperchunk, nlev); + + thandle.alloc_level_nchunks(nlev); + thandle.alloc_level_nrowsperchunk(nlev); + HandleDeviceEntriesType dlevel_nchunks = thandle.get_level_nchunks(); + HandleDeviceEntriesType dlevel_nrowsperchunk = + thandle.get_level_nrowsperchunk(); + Kokkos::deep_copy(dlevel_nchunks, level_nchunks); + Kokkos::deep_copy(dlevel_nrowsperchunk, level_nrowsperchunk); + } + else { + level_sched (thandle, L_row_map, L_entries, level_list, level_ptr, + level_idx, nlev); + } + + Kokkos::deep_copy(dlevel_ptr, level_ptr); + Kokkos::deep_copy(dlevel_idx, level_idx); + Kokkos::deep_copy(dlevel_list, level_list); + + Kokkos::deep_copy(L_row_map_d, L_row_map); + Kokkos::deep_copy(L_entries_d, L_entries); + Kokkos::deep_copy(U_row_map_d, U_row_map); + Kokkos::deep_copy(U_entries_d, U_entries); thandle.set_symbolic_complete(); @@ -378,9 +510,11 @@ void iluk_symbolic(IlukHandle& thandle, std::cout << " symbolic complete: " << thandle.is_symbolic_complete() << std::endl; std::cout << " num levels: " << thandle.get_num_levels() << std::endl; - std::cout << " max num rows levels: " << thandle.get_level_maxrows() + std::cout << " max num rows among levels: " << thandle.get_level_maxrows() << std::endl; - + std::cout << " max num rows among chunks among levels: " + << thandle.get_level_maxrowsperchunk() << std::endl; + std::cout << " iluk_symbolic result: " << std::endl; std::cout << " level_list = "; @@ -427,15 +561,6 @@ void iluk_symbolic(IlukHandle& thandle, } std::cout << std::endl; #endif - - Kokkos::deep_copy(dlevel_ptr, level_ptr); - Kokkos::deep_copy(dlevel_idx, level_idx); - Kokkos::deep_copy(dlevel_list, level_list); - - Kokkos::deep_copy(L_row_map_d, L_row_map); - Kokkos::deep_copy(L_entries_d, L_entries); - Kokkos::deep_copy(U_row_map_d, U_row_map); - Kokkos::deep_copy(U_entries_d, U_entries); } } // end iluk_symbolic From fd92857b927d18d078b06624db50f4e6c42b3ed8 Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Fri, 22 Apr 2022 16:34:57 -0600 Subject: [PATCH 108/261] Apply clang format --- src/sparse/KokkosSparse_spiluk_handle.hpp | 27 ++-- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 94 ++++++------- .../KokkosSparse_spiluk_symbolic_impl.hpp | 130 +++++++++--------- 3 files changed, 122 insertions(+), 129 deletions(-) diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp index 2b58a2aa72..3cabcd0f73 100644 --- a/src/sparse/KokkosSparse_spiluk_handle.hpp +++ b/src/sparse/KokkosSparse_spiluk_handle.hpp @@ -100,17 +100,17 @@ class SPILUKHandle { nnz_lno_view_t level_idx; // the list of rows in each level nnz_lno_view_t level_ptr; // the starting index (into the view level_idx) of each level - nnz_lno_view_t level_nchunks; //number of chunks of rows at each level - nnz_lno_view_t - level_nrowsperchunk; //maximum number of rows among chunks at each level + nnz_lno_view_t level_nchunks; // number of chunks of rows at each level + nnz_lno_view_t + level_nrowsperchunk; // maximum number of rows among chunks at each level size_type nrows; size_type nlevels; size_type nnzL; size_type nnzU; size_type level_maxrows; // max. number of rows among levels - size_type - level_maxrowsperchunk;//max.number of rows among chunks among levels + size_type + level_maxrowsperchunk; // max.number of rows among chunks among levels bool symbolic_complete; @@ -147,11 +147,10 @@ class SPILUKHandle { set_nnzU(nnzU_); set_level_maxrows(0); set_level_maxrowsperchunk(0); - level_list = nnz_row_view_t("level_list", nrows_), - level_idx = nnz_lno_view_t("level_idx", nrows_), - level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), - level_nchunks = nnz_lno_view_t(), - level_nrowsperchunk = nnz_lno_view_t(), + level_list = nnz_row_view_t("level_list", nrows_), + level_idx = nnz_lno_view_t("level_idx", nrows_), + level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), + level_nchunks = nnz_lno_view_t(), level_nrowsperchunk = nnz_lno_view_t(), reset_symbolic_complete(); } @@ -178,9 +177,7 @@ class SPILUKHandle { } KOKKOS_INLINE_FUNCTION - nnz_lno_view_t get_level_nrowsperchunk() const { - return level_nrowsperchunk; - } + nnz_lno_view_t get_level_nrowsperchunk() const { return level_nrowsperchunk; } void alloc_level_nrowsperchunk(const size_type nlevels_) { level_nrowsperchunk = nnz_lno_view_t("level_nrowsperchunk", nlevels_); @@ -216,8 +213,8 @@ class SPILUKHandle { size_type get_level_maxrowsperchunk() const { return level_maxrowsperchunk; } KOKKOS_INLINE_FUNCTION - void set_level_maxrowsperchunk(const size_type level_maxrowsperchunk_) { - this->level_maxrowsperchunk = level_maxrowsperchunk_; + void set_level_maxrowsperchunk(const size_type level_maxrowsperchunk_) { + this->level_maxrowsperchunk = level_maxrowsperchunk_; } bool is_symbolic_complete() const { return symbolic_complete; } diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 5ce550653b..d0b80ace69 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -323,9 +323,9 @@ struct ILUKLvlSchedTP1NumericFunctor { if (ipos != -1) { auto lxu = -U_values(kk) * fact; if (col < rowid) - Kokkos::atomic_add (&L_values(ipos), lxu); + Kokkos::atomic_add(&L_values(ipos), lxu); else - Kokkos::atomic_add (&U_values(ipos), lxu); + Kokkos::atomic_add(&U_values(ipos), lxu); } }); // end for kk @@ -383,19 +383,19 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, using size_type = typename IlukHandle::size_type; using nnz_lno_t = typename IlukHandle::nnz_lno_t; using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; - using WorkViewType = - Kokkos::View>; - using LevelHostViewType = Kokkos::View; + using WorkViewType = + Kokkos::View>; + using LevelHostViewType = Kokkos::View; size_type nlevels = thandle.get_num_levels(); size_type nrows = thandle.get_nrows(); // Keep these as host View, create device version and copy back to host - HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); - HandleDeviceEntriesType level_idx = thandle.get_level_idx(); + HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); + HandleDeviceEntriesType level_idx = thandle.get_level_idx(); HandleDeviceEntriesType level_nchunks = thandle.get_level_nchunks(); - HandleDeviceEntriesType level_nrowsperchunk = - thandle.get_level_nrowsperchunk(); + HandleDeviceEntriesType level_nrowsperchunk = + thandle.get_level_nrowsperchunk(); // Make level_ptr_h a separate allocation, since it will be accessed on host // between kernel launches. If a mirror were used and level_ptr is in UVM @@ -404,23 +404,28 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, LevelHostViewType level_ptr_h, level_nchunks_h, level_nrowsperchunk_h; WorkViewType iw; - level_ptr_h = LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level pointers"), - level_ptr.extent(0)); + level_ptr_h = LevelHostViewType( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level pointers"), + level_ptr.extent(0)); Kokkos::deep_copy(level_ptr_h, level_ptr); - if ( thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1 ) { - level_nchunks_h = LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level nchunks"), - level_nchunks.extent(0)); - level_nrowsperchunk_h = LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level nrowsperchunk"), - level_nrowsperchunk.extent(0)); - Kokkos::deep_copy(level_nchunks_h, level_nchunks); + if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + level_nchunks_h = LevelHostViewType( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level nchunks"), + level_nchunks.extent(0)); + level_nrowsperchunk_h = + LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, + "Host level nrowsperchunk"), + level_nrowsperchunk.extent(0)); + Kokkos::deep_copy(level_nchunks_h, level_nchunks); Kokkos::deep_copy(level_nrowsperchunk_h, level_nrowsperchunk); - iw = WorkViewType( Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), thandle.get_level_maxrowsperchunk(), nrows ); + iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), + thandle.get_level_maxrowsperchunk(), nrows); Kokkos::deep_copy(iw, nnz_lno_t(-1)); - } - else { - iw = WorkViewType( Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), thandle.get_level_maxrows(), nrows ); + } else { + iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), + thandle.get_level_maxrows(), nrows); Kokkos::deep_copy(iw, nnz_lno_t(-1)); } @@ -442,39 +447,36 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t>( A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, level_idx, iw, lev_start)); - } else if ( thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1 ) { + } else if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { using policy_type = Kokkos::TeamPolicy; - int team_size = thandle.get_team_size(); - + int team_size = thandle.get_team_size(); + nnz_lno_t lvl_rowid_start = 0; nnz_lno_t lvl_nrows_chunk; - for(int chunkid=0; chunkid + for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) { + if ((lvl_rowid_start + level_nrowsperchunk_h(lvl)) > (lev_end - lev_start)) - lvl_nrows_chunk = (lev_end - lev_start)-lvl_rowid_start; - else - lvl_nrows_chunk = level_nrowsperchunk_h(lvl); - + lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start; + else + lvl_nrows_chunk = level_nrowsperchunk_h(lvl); + ILUKLvlSchedTP1NumericFunctor< - ARowMapType, AEntriesType, AValuesType, - LRowMapType, LEntriesType, LValuesType, - URowMapType, UEntriesType, UValuesType, - HandleDeviceEntriesType, WorkViewType, nnz_lno_t> - tstf(A_row_map, A_entries, A_values, - L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, - level_idx, iw, lev_start+lvl_rowid_start); - - if ( team_size == -1 ) + ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, + LValuesType, URowMapType, UEntriesType, UValuesType, + HandleDeviceEntriesType, WorkViewType, nnz_lno_t> + tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values, level_idx, iw, + lev_start + lvl_rowid_start); + + if (team_size == -1) Kokkos::parallel_for("parfor_l_team", - policy_type( lvl_nrows_chunk , Kokkos::AUTO ), + policy_type(lvl_nrows_chunk, Kokkos::AUTO), tstf); else Kokkos::parallel_for("parfor_l_team", - policy_type( lvl_nrows_chunk , team_size ), - tstf); - + policy_type(lvl_nrows_chunk, team_size), tstf); + lvl_rowid_start += lvl_nrows_chunk; } } diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 672ba1f8fe..5a97665179 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -68,7 +68,7 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, size_type& nlevels) { // Scheduling currently compute on host - using nnz_lno_t = typename IlukHandle::nnz_lno_t; + using nnz_lno_t = typename IlukHandle::nnz_lno_t; size_type nrows = thandle.get_nrows(); @@ -119,19 +119,14 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, thandle.set_level_maxrows(maxrows); } -//SEQLVLSCHD_TP1 algorithm (chunks) -template -void level_sched ( IlukHandle& thandle, const RowMapType row_map, - const EntriesType entries, LevelType1& level_list, - LevelType2& level_ptr, LevelType2& level_idx, - LevelType3& level_nchunks, LevelType3& level_nrowsperchunk, - size_type &nlevels ) { +// SEQLVLSCHD_TP1 algorithm (chunks) +template +void level_sched(IlukHandle& thandle, const RowMapType row_map, + const EntriesType entries, LevelType1& level_list, + LevelType2& level_ptr, LevelType2& level_idx, + LevelType3& level_nchunks, LevelType3& level_nrowsperchunk, + size_type& nlevels) { // Scheduling currently compute on host using nnz_lno_t = typename IlukHandle::nnz_lno_t; @@ -142,75 +137,76 @@ void level_sched ( IlukHandle& thandle, const RowMapType row_map, nlevels = 0; level_ptr(0) = 0; - for ( size_type i = 0; i < nrows; ++i ) { - size_type l = 0; - size_type rowstart= row_map(i); - size_type rowend = row_map(i+1); - for ( size_type j = rowstart; j < rowend; ++j ) { + for (size_type i = 0; i < nrows; ++i) { + size_type l = 0; + size_type rowstart = row_map(i); + size_type rowend = row_map(i + 1); + for (size_type j = rowstart; j < rowend; ++j) { nnz_lno_t col = entries(j); - l = std::max(l, level_list(col)); + l = std::max(l, level_list(col)); } - level_list(i) = l+1; - level_ptr(l+1) += 1; - nlevels = std::max(nlevels, l+1); + level_list(i) = l + 1; + level_ptr(l + 1) += 1; + nlevels = std::max(nlevels, l + 1); } - for ( size_type i = 1; i <= nlevels; ++i ) { - level_ptr(i) += level_ptr(i-1); + for (size_type i = 1; i <= nlevels; ++i) { + level_ptr(i) += level_ptr(i - 1); } - for ( size_type i = 0; i < nrows; i++ ) { - level_idx(level_ptr(level_list(i)-1)) = i; - level_ptr(level_list(i)-1) += 1; + for (size_type i = 0; i < nrows; i++) { + level_idx(level_ptr(level_list(i) - 1)) = i; + level_ptr(level_list(i) - 1) += 1; } - if (nlevels>0) {// note: to avoid wrapping around to the max of size_t - // when nlevels = 0. - for ( size_type i = nlevels-1; i > 0; --i ) { - level_ptr(i) = level_ptr(i-1); + if (nlevels > 0) { // note: to avoid wrapping around to the max of size_t + // when nlevels = 0. + for (size_type i = nlevels - 1; i > 0; --i) { + level_ptr(i) = level_ptr(i - 1); } } level_ptr(0) = 0; // Find max rows, number of chunks, max rows of chunks across levels - using HostViewType = Kokkos::View; + using HostViewType = + Kokkos::View; - HostViewType lnchunks( "lnchunks", nlevels ); - HostViewType lnrowsperchunk( "lnrowsperchunk", nlevels ); + HostViewType lnchunks("lnchunks", nlevels); + HostViewType lnrowsperchunk("lnrowsperchunk", nlevels); size_t avail_byte = 0; #ifdef KOKKOS_ENABLE_CUDA - if ( std::is_same< memory_space, Kokkos::CudaSpace >::value ) { + if (std::is_same::value) { size_t free_byte, total_byte; - KokkosKernels::Impl::kk_get_free_total_memory(free_byte, total_byte); - avail_byte = static_cast(0.85*free_byte); + KokkosKernels::Impl::kk_get_free_total_memory(free_byte, + total_byte); + avail_byte = static_cast(0.85 * free_byte); } #endif - size_type maxrows = 0; + size_type maxrows = 0; size_type maxrowsperchunk = 0; - for ( size_type i = 0; i < nlevels; ++i ) { - size_type lnrows = level_ptr(i+1) - level_ptr(i); - if( maxrows < lnrows ) { + for (size_type i = 0; i < nlevels; ++i) { + size_type lnrows = level_ptr(i + 1) - level_ptr(i); + if (maxrows < lnrows) { maxrows = lnrows; } #ifdef KOKKOS_ENABLE_CUDA - size_t required_size = static_cast(lnrows)*nrows*sizeof(nnz_lno_t); - if ( std::is_same< memory_space, Kokkos::CudaSpace >::value ) - { - lnchunks(i) = required_size/avail_byte+1; - lnrowsperchunk(i) = (lnrows%lnchunks(i)==0)?(lnrows/lnchunks(i)): - (lnrows/lnchunks(i)+1); - } - else + size_t required_size = + static_cast(lnrows) * nrows * sizeof(nnz_lno_t); + if (std::is_same::value) { + lnchunks(i) = required_size / avail_byte + 1; + lnrowsperchunk(i) = (lnrows % lnchunks(i) == 0) + ? (lnrows / lnchunks(i)) + : (lnrows / lnchunks(i) + 1); + } else #endif { - lnchunks(i) = 1; + lnchunks(i) = 1; lnrowsperchunk(i) = lnrows; } - if( maxrowsperchunk < lnrowsperchunk(i) ) { + if (maxrowsperchunk < lnrowsperchunk(i)) { maxrowsperchunk = lnrowsperchunk(i); } } @@ -219,9 +215,8 @@ void level_sched ( IlukHandle& thandle, const RowMapType row_map, thandle.set_level_maxrows(maxrows); thandle.set_level_maxrowsperchunk(maxrowsperchunk); - level_nchunks = lnchunks; + level_nchunks = lnchunks; level_nrowsperchunk = lnrowsperchunk; - } // Linear Search for the smallest row index @@ -324,8 +319,8 @@ void iluk_symbolic(IlukHandle& thandle, // Can only resize managed views Kokkos::resize(L_entries_d, // L_entries_d.extent(0)-3); thandle.set_nnzL(L_entries_d.extent(0)+5); - using HostTmpViewType = - Kokkos::View; + using HostTmpViewType = + Kokkos::View; HostTmpViewType h_lev("h_lev", thandle.get_nnzU()); HostTmpViewType h_iw("h_iw", nrows); @@ -475,22 +470,21 @@ void iluk_symbolic(IlukHandle& thandle, } // Level scheduling on L - if ( thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1 ) { - level_sched (thandle, L_row_map, L_entries, level_list, level_ptr, - level_idx, level_nchunks, level_nrowsperchunk, nlev); + if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, + level_idx, level_nchunks, level_nrowsperchunk, nlev); thandle.alloc_level_nchunks(nlev); thandle.alloc_level_nrowsperchunk(nlev); HandleDeviceEntriesType dlevel_nchunks = thandle.get_level_nchunks(); - HandleDeviceEntriesType dlevel_nrowsperchunk = - thandle.get_level_nrowsperchunk(); + HandleDeviceEntriesType dlevel_nrowsperchunk = + thandle.get_level_nrowsperchunk(); Kokkos::deep_copy(dlevel_nchunks, level_nchunks); Kokkos::deep_copy(dlevel_nrowsperchunk, level_nrowsperchunk); - } - else { - level_sched (thandle, L_row_map, L_entries, level_list, level_ptr, - level_idx, nlev); + } else { + level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, + level_idx, nlev); } Kokkos::deep_copy(dlevel_ptr, level_ptr); @@ -514,7 +508,7 @@ void iluk_symbolic(IlukHandle& thandle, << std::endl; std::cout << " max num rows among chunks among levels: " << thandle.get_level_maxrowsperchunk() << std::endl; - + std::cout << " iluk_symbolic result: " << std::endl; std::cout << " level_list = "; From 1b21ab746eb87c1da1bd1684aca84b97405105d2 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Fri, 22 Apr 2022 17:31:07 -0600 Subject: [PATCH 109/261] Fix some warnings --- src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 5a97665179..a455fa355b 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -130,7 +130,6 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, // Scheduling currently compute on host using nnz_lno_t = typename IlukHandle::nnz_lno_t; - using memory_space = typename IlukHandle::memory_space; size_type nrows = thandle.get_nrows(); @@ -175,8 +174,9 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, HostViewType lnchunks("lnchunks", nlevels); HostViewType lnrowsperchunk("lnrowsperchunk", nlevels); - size_t avail_byte = 0; #ifdef KOKKOS_ENABLE_CUDA + using memory_space = typename IlukHandle::memory_space; + size_t avail_byte = 0; if (std::is_same::value) { size_t free_byte, total_byte; KokkosKernels::Impl::kk_get_free_total_memory(free_byte, @@ -206,7 +206,7 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, lnchunks(i) = 1; lnrowsperchunk(i) = lnrows; } - if (maxrowsperchunk < lnrowsperchunk(i)) { + if (maxrowsperchunk < static_cast(lnrowsperchunk(i))) { maxrowsperchunk = lnrowsperchunk(i); } } From fe0020936549d59c3f2285a7d7b64516f3f6900a Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Fri, 22 Apr 2022 17:40:50 -0600 Subject: [PATCH 110/261] Fix clang format --- src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index a455fa355b..90bb88e057 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -129,7 +129,7 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, size_type& nlevels) { // Scheduling currently compute on host - using nnz_lno_t = typename IlukHandle::nnz_lno_t; + using nnz_lno_t = typename IlukHandle::nnz_lno_t; size_type nrows = thandle.get_nrows(); @@ -176,7 +176,7 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, #ifdef KOKKOS_ENABLE_CUDA using memory_space = typename IlukHandle::memory_space; - size_t avail_byte = 0; + size_t avail_byte = 0; if (std::is_same::value) { size_t free_byte, total_byte; KokkosKernels::Impl::kk_get_free_total_memory(free_byte, From 3381f9bff40d3414c1eeac046cfae7b5ce0367e2 Mon Sep 17 00:00:00 2001 From: kliegeois Date: Thu, 31 Mar 2022 06:57:54 -0600 Subject: [PATCH 111/261] Update Batched GMRES --- example/batched_solve/CMakeLists.txt | 7 +- example/batched_solve/examples_helper.hpp | 85 ++++ example/batched_solve/team_GMRES.cpp | 358 +++++++++++++++++ src/batched/KokkosBatched_Util.hpp | 11 + src/batched/dense/KokkosBatched_Copy_Decl.hpp | 2 +- .../KokkosBatched_Gemv_TeamVector_Impl.hpp | 28 +- ...KokkosBatched_Gemv_TeamVector_Internal.hpp | 63 +++ .../impl/KokkosBatched_Gemv_Team_Impl.hpp | 28 +- .../impl/KokkosBatched_Gemv_Team_Internal.hpp | 56 +++ src/batched/sparse/KokkosBatched_CG.hpp | 13 +- .../sparse/KokkosBatched_CrsMatrix.hpp | 88 +--- src/batched/sparse/KokkosBatched_GMRES.hpp | 19 +- src/batched/sparse/KokkosBatched_Identity.hpp | 12 +- .../sparse/KokkosBatched_JacobiPrec.hpp | 41 +- .../sparse/KokkosBatched_Krylov_Handle.hpp | 376 +++++++++++++++++- .../impl/KokkosBatched_CG_TeamVector_Impl.hpp | 54 ++- .../impl/KokkosBatched_CG_Team_Impl.hpp | 54 ++- .../impl/KokkosBatched_GMRES_Serial_Impl.hpp | 333 ++++++++++++++++ .../KokkosBatched_GMRES_TeamVector_Impl.hpp | 331 +++++++++------ .../impl/KokkosBatched_GMRES_Team_Impl.hpp | 328 +++++++++------ .../sparse/Test_Batched_SerialGMRES.hpp | 239 +++++++++++ .../sparse/Test_Batched_SerialGMRES_Real.hpp | 12 + .../batched/sparse/Test_Batched_Sparse.hpp | 2 + .../batched/sparse/Test_Batched_TeamCG.hpp | 28 +- .../batched/sparse/Test_Batched_TeamGMRES.hpp | 103 ++++- .../sparse/Test_Batched_TeamVectorCG.hpp | 23 +- .../sparse/Test_Batched_TeamVectorGMRES.hpp | 69 +++- 27 files changed, 2333 insertions(+), 430 deletions(-) create mode 100644 example/batched_solve/team_GMRES.cpp create mode 100644 src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp create mode 100644 unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp create mode 100644 unit_test/batched/sparse/Test_Batched_SerialGMRES_Real.hpp diff --git a/example/batched_solve/CMakeLists.txt b/example/batched_solve/CMakeLists.txt index da55b170cd..2e3ce96523 100644 --- a/example/batched_solve/CMakeLists.txt +++ b/example/batched_solve/CMakeLists.txt @@ -4,4 +4,9 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) KOKKOSKERNELS_ADD_EXECUTABLE( static_pivoting SOURCES static_pivoting.cpp - ) \ No newline at end of file + ) + +KOKKOSKERNELS_ADD_EXECUTABLE( + team_GMRES + SOURCES team_GMRES.cpp + ) diff --git a/example/batched_solve/examples_helper.hpp b/example/batched_solve/examples_helper.hpp index ffd774967b..41b936a35c 100644 --- a/example/batched_solve/examples_helper.hpp +++ b/example/batched_solve/examples_helper.hpp @@ -148,4 +148,89 @@ void create_saddle_point_matrices(const MatrixViewType &A, Kokkos::deep_copy(Y, Y_host); Kokkos::fence(); +} + +template +void create_tridiagonal_batched_matrices(const int nnz, const int BlkSize, + const int N, const IntView &r, + const IntView &c, + const VectorViewType &D, + const VectorViewType &X, + const VectorViewType &B) { + Kokkos::Random_XorShift64_Pool< + typename VectorViewType::device_type::execution_space> + random(13718); + Kokkos::fill_random( + X, random, + Kokkos::reduction_identity::prod()); + Kokkos::fill_random( + B, random, + Kokkos::reduction_identity::prod()); + + auto D_host = Kokkos::create_mirror_view(D); + auto r_host = Kokkos::create_mirror_view(r); + auto c_host = Kokkos::create_mirror_view(c); + + r_host(0) = 0; + + int current_col = 0; + + for (int i = 0; i < BlkSize; ++i) { + r_host(i + 1) = r_host(i) + (i == 0 || i == (BlkSize - 1) ? 2 : 3); + } + for (int i = 0; i < nnz; ++i) { + if (i % 3 == 0) { + for (int l = 0; l < N; ++l) { + D_host(l, i) = typename VectorViewType::value_type(2.0); + } + c_host(i) = current_col; + ++current_col; + } else { + for (int l = 0; l < N; ++l) { + D_host(l, i) = typename VectorViewType::value_type(-1.0); + } + c_host(i) = current_col; + if (i % 3 == 1) + --current_col; + else + ++current_col; + } + } + + Kokkos::fence(); + + Kokkos::deep_copy(D, D_host); + Kokkos::deep_copy(r, r_host); + Kokkos::deep_copy(c, c_host); + + Kokkos::fence(); +} + +template +void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, + const VType &diag) { + auto diag_values_host = Kokkos::create_mirror_view(diag); + auto values_host = Kokkos::create_mirror_view(V); + auto row_ptr_host = Kokkos::create_mirror_view(r); + auto colIndices_host = Kokkos::create_mirror_view(c); + + Kokkos::deep_copy(values_host, V); + Kokkos::deep_copy(row_ptr_host, r); + Kokkos::deep_copy(colIndices_host, c); + + int current_index; + int N = diag.extent(0); + int BlkSize = diag.extent(1); + + for (int i = 0; i < BlkSize; ++i) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); + ++current_index) { + if (colIndices_host(current_index) == i) break; + } + for (int j = 0; j < N; ++j) { + diag_values_host(j, i) = 1. / values_host(j, current_index); + } + } + + Kokkos::deep_copy(diag, diag_values_host); } \ No newline at end of file diff --git a/example/batched_solve/team_GMRES.cpp b/example/batched_solve/team_GMRES.cpp new file mode 100644 index 0000000000..b94ad00709 --- /dev/null +++ b/example/batched_solve/team_GMRES.cpp @@ -0,0 +1,358 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#include + +#define KOKKOSKERNELS_DEBUG_LEVEL 0 + +#include "Kokkos_Core.hpp" +#include "Kokkos_Timer.hpp" +#include "Kokkos_Random.hpp" +#include "Kokkos_UnorderedMap.hpp" +#include "Kokkos_Sort.hpp" + +/// KokkosKernels headers +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" +#include "KokkosKernels_IOUtils.hpp" + +#include +#include +#include "examples_helper.hpp" +#include +#include +#include +#include +#include + +typedef Kokkos::DefaultExecutionSpace exec_space; + +template +struct Functor_TestBatchedTeamVectorGMRES { + const ValuesViewType _D; + const ValuesViewType _diag; + const IntView _r; + const IntView _c; + const VectorViewType _X; + const VectorViewType _B; + const int _N_team, _team_size, _vector_length; + const int _N_iteration; + const double _tol; + const int _ortho_strategy; + const int _scratch_pad_level; + KrylovHandleType _handle; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamVectorGMRES( + const ValuesViewType &D, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int N_team, + const int team_size, const int vector_length, const int N_iteration, + const double tol, const int ortho_strategy, const int scratch_pad_level, + KrylovHandleType &handle) + : _D(D), + _r(r), + _c(c), + _X(X), + _B(B), + _N_team(N_team), + _team_size(team_size), + _vector_length(vector_length), + _N_iteration(N_iteration), + _tol(tol), + _ortho_strategy(ortho_strategy), + _scratch_pad_level(scratch_pad_level), + _handle(handle) {} + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamVectorGMRES( + const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, + const IntView &c, const VectorViewType &X, const VectorViewType &B, + const int N_team, const int team_size, const int vector_length, + const int N_iteration, const double tol, int ortho_strategy, + const int scratch_pad_level, KrylovHandleType &handle) + : _D(D), + _diag(diag), + _r(r), + _c(c), + _X(X), + _B(B), + _N_team(N_team), + _team_size(team_size), + _vector_length(vector_length), + _N_iteration(N_iteration), + _tol(tol), + _ortho_strategy(ortho_strategy), + _scratch_pad_level(scratch_pad_level), + _handle(handle) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { + const int first_matrix = static_cast(member.league_rank()) * _N_team; + const int N = _D.extent(0); + const int last_matrix = + (static_cast(member.league_rank() + 1) * _N_team < N + ? static_cast(member.league_rank() + 1) * _N_team + : N); + using TeamVectorCopy1D = + KokkosBatched::TeamVectorCopy; + + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + + using ScratchPadIntViewType = + Kokkos::View; + using ScratchPadValuesViewType = Kokkos::View< + typename ValuesViewType::non_const_value_type **, + typename ValuesViewType::array_layout, + typename ValuesViewType::execution_space::scratch_memory_space>; + + using Operator = + KokkosBatched::CrsMatrix; + + ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), + _r.extent(0) + _c.extent(0)); + + auto r = + Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); + auto c = Kokkos::subview( + tmp_1D_int, + Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); + + TeamVectorCopy1D::invoke(member, _r, r); + TeamVectorCopy1D::invoke(member, _c, c); + Operator A(d, r, c); + + if (UsePrec) { + ScratchPadValuesViewType diag( + member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); + using PrecOperator = KokkosBatched::JacobiPrec; + + KokkosBatched::TeamVectorCopy::invoke( + member, + Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL), + diag); + PrecOperator P(diag); + P.setComputedInverse(); + + KokkosBatched::TeamVectorGMRES::template invoke< + Operator, VectorViewType, PrecOperator, KrylovHandleType>( + member, A, b, x, P, _handle); + } else { + KokkosBatched::TeamVectorGMRES::template invoke< + Operator, VectorViewType>(member, A, b, x, _handle); + } + } + + inline double run() { + typedef typename ValuesViewType::value_type value_type; + std::string name("KokkosBatched::Test::TeamVectorGMRES"); + Kokkos::Timer timer; + Kokkos::Profiling::pushRegion(name.c_str()); + + Kokkos::TeamPolicy auto_policy( + ceil(1. * _D.extent(0) / _N_team), Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy( + ceil(1. * _D.extent(0) / _N_team), _team_size, _vector_length); + Kokkos::TeamPolicy policy; + + if (_team_size < 1) + policy = auto_policy; + else + policy = tuned_policy; + + _handle.set_max_iteration(_N_iteration); + _handle.set_tolerance(_tol); + _handle.set_ortho_strategy(_ortho_strategy); + _handle.set_scratch_pad_level(_scratch_pad_level); + _handle.set_compute_last_residual(true); + + int maximum_iteration = _handle.get_max_iteration(); + + using ScalarType = typename ValuesViewType::non_const_value_type; + using Layout = typename ValuesViewType::array_layout; + using EXSP = typename ValuesViewType::execution_space; + + using MagnitudeType = + typename Kokkos::Details::ArithTraits::mag_type; + + using ViewType1D = Kokkos::View; + using ViewType2D = Kokkos::View; + using ViewType3D = Kokkos::View; + + size_t bytes_1D = ViewType2D::shmem_size(_N_team, 1); + size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0)); + size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); + size_t bytes_2D_1 = ViewType2D::shmem_size(_N_team, _X.extent(1)); + size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1); + size_t bytes_3D_1 = + ViewType3D::shmem_size(_N_team, _X.extent(1), maximum_iteration); + size_t bytes_3D_2 = ViewType3D::shmem_size(_N_team, maximum_iteration + 1, + maximum_iteration); + size_t bytes_3D_3 = ViewType3D::shmem_size(_N_team, 2, maximum_iteration); + + size_t bytes_int = bytes_row_ptr + bytes_col_idc; + size_t bytes_diag = bytes_2D_1; + size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; + + policy.set_scratch_size( + 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); + + exec_space().fence(); + timer.reset(); + Kokkos::parallel_for(name.c_str(), policy, *this); + exec_space().fence(); + double sec = timer.seconds(); + + return sec; + } +}; + +int main(int /*argc*/, char ** /*argv*/) { + Kokkos::initialize(); + { + using layout = Kokkos::LayoutLeft; + + using IntView = Kokkos::View; + using AMatrixValueView = Kokkos::View; + using XYType = Kokkos::View; + + std::string name_A = "mat.mm"; + std::string name_B = "rhs.mm"; + + int N, Blk, nnz, ncols; + + Blk = 10; + N = 100; + nnz = (Blk - 2) * 3 + 2 * 2; + + IntView rowOffsets("rowOffsets", Blk + 1); + IntView colIndices("colIndices", nnz); + AMatrixValueView values("values", N, nnz); + AMatrixValueView diag("diag", N, Blk); + XYType x("x", N, Blk); + XYType y("y", N, Blk); + + printf("N = %d, Blk = %d, nnz = %d\n", N, Blk, nnz); + + create_tridiagonal_batched_matrices(nnz, Blk, N, rowOffsets, colIndices, + values, x, y); + + // Replace y by ones: + Kokkos::deep_copy(y, 1.); + + // Replace x by zeros: + // Kokkos::deep_copy(x, 0.); + + getInvDiagFromCRS(values, rowOffsets, colIndices, diag); + + using ScalarType = typename AMatrixValueView::non_const_value_type; + using Layout = typename AMatrixValueView::array_layout; + using EXSP = typename AMatrixValueView::execution_space; + + using MagnitudeType = + typename Kokkos::Details::ArithTraits::mag_type; + using NormViewType = Kokkos::View; + + using Norm2DViewType = Kokkos::View; + using Scalar3DViewType = Kokkos::View; + using IntViewType = Kokkos::View; + + using KrylovHandleType = + KokkosBatched::KrylovHandle; + + const int N_team = 2; + const int n_iterations = 150; + + const int team_size = -1; + const int vector_length = -1; + const double tol = 1e-8; + const int ortho_strategy = 0; + + KrylovHandleType handle(N, N_team, n_iterations, true); + handle.Arnoldi_view = + Scalar3DViewType("", N, n_iterations, Blk + n_iterations + 3); + + double time = + Functor_TestBatchedTeamVectorGMRES( + values, diag, rowOffsets, colIndices, x, y, N_team, team_size, + vector_length, n_iterations, tol, ortho_strategy, 0, handle) + .run(); + + printf("times = %f secondes\n", time); + + for (int i = 0; i < N; ++i) { + if (handle.is_converged_host(i)) { + std::cout + << "System " << i << " converged in " + << handle.get_iteration_host(i) + << " iterations, the initial absolute norm of the residual was " + << handle.get_norm_host(i, 0) << " and is now " + << handle.get_last_norm_host(i) << std::endl; + } else { + std::cout + << "System " << i << " did not converge in " + << handle.get_max_iteration() + << " iterations, the initial absolute norm of the residual was " + << handle.get_norm_host(i, 0) << " and is now " + << handle.get_last_norm_host(i) << std::endl; + } + } + if (handle.is_converged_host()) + std::cout << "All the systems have converged." << std::endl; + else + std::cout << "There is at least one system that did not converge." + << std::endl; + } + Kokkos::finalize(); +} diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 2b523e1e5f..0d2eb7f395 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -718,6 +718,17 @@ KOKKOS_INLINE_FUNCTION iMatrix = iTemp / numRows; } +template +KOKKOS_INLINE_FUNCTION + typename std::enable_if::value, + void>::type + getIndices(const OrdinalType iTemp, const OrdinalType /*numRows*/, + const OrdinalType numMatrices, OrdinalType &iRow, + OrdinalType &iMatrix) { + iRow = iTemp / numMatrices; + iMatrix = iTemp % numMatrices; +} + template KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, const int *order) { constexpr int rank = 2; diff --git a/src/batched/dense/KokkosBatched_Copy_Decl.hpp b/src/batched/dense/KokkosBatched_Copy_Decl.hpp index c12c8d7209..af240c7d8b 100644 --- a/src/batched/dense/KokkosBatched_Copy_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Copy_Decl.hpp @@ -11,7 +11,7 @@ namespace KokkosBatched { /// Serial Copy /// -template +template struct SerialCopy { template KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp index 7e21019f94..0cad2c6c80 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp @@ -30,9 +30,17 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - return TeamVectorGemvInternal::invoke( - member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + if (AViewType::Rank == 2) + return TeamVectorGemvInternal::invoke( + member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + else + return TeamVectorGemvInternal::template invoke< + MemberType, ScalarType, typename AViewType::array_layout, + typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), + A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(), + x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; @@ -60,9 +68,17 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - return TeamVectorGemvInternal::invoke( - member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + if (AViewType::Rank == 2) + return TeamVectorGemvInternal::invoke( + member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + else + return TeamVectorGemvInternal::template invoke< + MemberType, ScalarType, typename AViewType::array_layout, + typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(), + x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp index f4054030a3..419698a24e 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp @@ -28,6 +28,20 @@ struct TeamVectorGemvInternal { assert(false && "Error: encounter dummy impl"); return 0; } + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType & /*member*/, const int /*N*/, const int /*m*/, + const int /*n*/, const ScalarType /*alpha*/, + const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, + const int /*as1*/, const int /*as2*/, + const ValueType *KOKKOS_RESTRICT /*x*/, const int /*xs0*/, + const int /*xs1*/, const ScalarType /*beta*/, + /**/ ValueType *KOKKOS_RESTRICT /*y*/, const int /*ys0*/, + const int /*ys1*/) { + assert(false && "Error: encounter dummy impl"); + return 0; + } }; template <> @@ -69,6 +83,55 @@ TeamVectorGemvInternal::invoke( return 0; } +template <> +template +KOKKOS_INLINE_FUNCTION int +TeamVectorGemvInternal::invoke( + const MemberType &member, const int N, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X, + const int xs0, const int xs1, const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + const ScalarType one(1.0), zero(0.0); + + // y_l = beta y_l + alpha A_l x_l for l in range(0, N) + // y_l (m), A_l(m x n), B_l(n) + + if (beta == zero) + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), + [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] = zero; + }); + else if (beta != one) + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), + [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] *= beta; + }); + + if (alpha != zero) { + if (m <= 0 || n <= 0) return 0; + + if (beta != one) member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), + [&](const int &iTemp) { + int iRow, iMatrix; + ValueType t(0); + getIndices(iTemp, m, N, iRow, iMatrix); + for (int i = 0; i < n; ++i) + t += A[as0 * iMatrix + as1 * iRow + as2 * i] * + X[xs0 * iMatrix + xs1 * i]; + Y[ys0 * iMatrix + ys1 * iRow] += alpha * t; + }); + } + return 0; +} + } // namespace KokkosBatched #endif diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp index 73ee2b9ad3..d32232524a 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp @@ -30,9 +30,17 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - return TeamGemvInternal::invoke( - member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + if (AViewType::Rank == 2) + return TeamGemvInternal::invoke( + member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + else + return TeamGemvInternal::template invoke< + MemberType, ScalarType, typename AViewType::array_layout, + typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), + A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(), + x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; @@ -60,9 +68,17 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - return TeamGemvInternal::invoke( - member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + if (AViewType::Rank == 2) + return TeamGemvInternal::invoke( + member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + else + return TeamGemvInternal::template invoke< + MemberType, ScalarType, typename AViewType::array_layout, + typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(), + x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp index 98415cd034..8315a59ce6 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp @@ -24,6 +24,14 @@ struct TeamGemvInternal { const int as1, const ValueType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT y, const int ys0); + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType &member, const int N, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const int as2, const ValueType *KOKKOS_RESTRICT x, + const int xs0, const int xs1, const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT y, const int ys0, const int ys1); }; template <> @@ -105,6 +113,54 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( return 0; } + +template <> +template +KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( + const MemberType &member, const int N, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X, + const int xs0, const int xs1, const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + const ScalarType one(1.0), zero(0.0); + + // y_l = beta y_l + alpha A_l x_l for l in range(0, N) + // y_l (m), A_l(m x n), B_l(n) + + if (beta == zero) + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), + [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] = zero; + }); + else if (beta != one) + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), + [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] *= beta; + }); + + if (alpha != zero) { + if (m <= 0 || n <= 0) return 0; + + if (beta != one) member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), + [&](const int &iTemp) { + int iRow, iMatrix; + ValueType t(0); + getIndices(iTemp, m, N, iRow, iMatrix); + for (int i = 0; i < n; ++i) + t += A[as0 * iMatrix + as1 * iRow + as2 * i] * + X[xs0 * iMatrix + xs1 * i]; + Y[ys0 * iMatrix + ys1 * iRow] += alpha * t; + }); + } + return 0; +} } // namespace KokkosBatched #endif diff --git a/src/batched/sparse/KokkosBatched_CG.hpp b/src/batched/sparse/KokkosBatched_CG.hpp index e1e6b5d6a4..7fa1f7e04b 100644 --- a/src/batched/sparse/KokkosBatched_CG.hpp +++ b/src/batched/sparse/KokkosBatched_CG.hpp @@ -68,12 +68,13 @@ namespace KokkosBatched { template struct CG { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const OperatorType &A, const VectorViewType &B, - const VectorViewType &X, - const KrylovHandle - &handle) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const OperatorType &A, + const VectorViewType &B, + const VectorViewType &X, + const KrylovHandleType &handle) { int status = 0; if (std::is_same::value) { status = diff --git a/src/batched/sparse/KokkosBatched_CrsMatrix.hpp b/src/batched/sparse/KokkosBatched_CrsMatrix.hpp index 5448c4684c..1d3edcd343 100644 --- a/src/batched/sparse/KokkosBatched_CrsMatrix.hpp +++ b/src/batched/sparse/KokkosBatched_CrsMatrix.hpp @@ -104,89 +104,37 @@ class CrsMatrix { /// \param beta [in]: input coefficient for Y (default value 0.) /// \param Y [in/out]: Output vector Y, a rank 2 view - template + template KOKKOS_INLINE_FUNCTION void apply( const MemberType &member, const XViewType &X, const YViewType &Y, MagnitudeType alpha = Kokkos::Details::ArithTraits::one(), MagnitudeType beta = Kokkos::Details::ArithTraits::zero()) const { if (beta == 0) - KokkosBatched::Spmv::template invoke< + KokkosBatched::TeamVectorSpmv::template invoke< ValuesViewType, IntViewType, XViewType, YViewType, 0>( member, alpha, values, row_ptr, colIndices, X, beta, Y); else - KokkosBatched::Spmv::template invoke< + KokkosBatched::TeamVectorSpmv::template invoke< ValuesViewType, IntViewType, XViewType, YViewType, 1>( member, alpha, values, row_ptr, colIndices, X, beta, Y); } - /// \brief apply version that uses variable coefficient alpha and no beta - /// y_l <- alpha_l * A_l * x_l for all l = 1, ..., N - /// where: - /// * N is the number of matrices, - /// * A_1, ..., A_N are N sparse matrices which share the same sparsity - /// pattern, - /// * x_1, ..., x_N are the N input vectors, - /// * y_1, ..., y_N are the N output vectors, - /// * alpha_1, ..., alpha_N are N scaling factors for x_1, ..., x_N. - /// - /// \tparam MemberType: Input type for the TeamPolicy member - /// \tparam XViewType: Input type for X, needs to be a 2D view - /// \tparam YViewType: Input type for Y, needs to be a 2D view - /// \tparam ArgTrans: Argument for transpose or notranspose - /// \tparam ArgMode: Argument for the parallelism used in the apply - /// - /// \param member [in]: TeamPolicy member - /// \param alpha [in]: input coefficient for X, a rank 1 view - /// \param X [in]: Input vector X, a rank 2 view - /// \param Y [out]: Output vector Y, a rank 2 view - - template - KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, - const XViewType &X, const YViewType &Y, - NormViewType alpha) const { - KokkosBatched::Spmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, NormViewType, - NormViewType, 0>(member, alpha, values, row_ptr, colIndices, X, alpha, - Y); - } - - /// \brief apply version that uses variable coefficients alpha and beta - /// y_l <- alpha_l * A_l * x_l + beta_l * y_l for all l = 1, ..., N - /// where: - /// * N is the number of matrices, - /// * A_1, ..., A_N are N sparse matrices which share the same sparsity - /// pattern, - /// * x_1, ..., x_N are the N input vectors, - /// * y_1, ..., y_N are the N output vectors, - /// * alpha_1, ..., alpha_N are N scaling factors for x_1, ..., x_N, - /// * beta_1, ..., beta_N are N scaling factors for y_1, ..., y_N. - /// - /// \tparam MemberType: Input type for the TeamPolicy member - /// \tparam XViewType: Input type for X, needs to be a 2D view - /// \tparam YViewType: Input type for Y, needs to be a 2D view - /// \tparam NormViewType: Input type for alpha and beta, needs to be a 1D view - /// \tparam ArgTrans: Argument for transpose or notranspose - /// \tparam ArgMode: Argument for the parallelism used in the apply - /// - /// \param member [in]: TeamPolicy member - /// \param alpha [in]: input coefficient for X, a rank 1 view - /// \param X [in]: Input vector X, a rank 2 view - /// \param beta [in]: input coefficient for Y, a rank 1 view - /// \param Y [in/out]: Output vector Y, a rank 2 view - - template - KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, - const XViewType &X, const YViewType &Y, - const NormViewType &alpha, - const NormViewType &beta) const { - KokkosBatched::Spmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, NormViewType, - NormViewType, 1>(member, alpha, values, row_ptr, colIndices, X, beta, - Y); + template + KOKKOS_INLINE_FUNCTION void apply( + const XViewType &X, const YViewType &Y, + MagnitudeType alpha = Kokkos::Details::ArithTraits::one(), + MagnitudeType beta = + Kokkos::Details::ArithTraits::zero()) const { + if (beta == 0) + KokkosBatched::SerialSpmv::template invoke< + ValuesViewType, IntViewType, XViewType, YViewType, 0>( + alpha, values, row_ptr, colIndices, X, beta, Y); + else + KokkosBatched::SerialSpmv::template invoke< + ValuesViewType, IntViewType, XViewType, YViewType, 1>( + alpha, values, row_ptr, colIndices, X, beta, Y); } }; diff --git a/src/batched/sparse/KokkosBatched_GMRES.hpp b/src/batched/sparse/KokkosBatched_GMRES.hpp index 512970006b..5a7a8a7749 100644 --- a/src/batched/sparse/KokkosBatched_GMRES.hpp +++ b/src/batched/sparse/KokkosBatched_GMRES.hpp @@ -61,6 +61,7 @@ /// the tolerance or the maximal number of iterations of the solver. #include "KokkosBatched_Krylov_Handle.hpp" +#include "KokkosBatched_GMRES_Serial_Impl.hpp" #include "KokkosBatched_GMRES_Team_Impl.hpp" #include "KokkosBatched_GMRES_TeamVector_Impl.hpp" @@ -68,14 +69,18 @@ namespace KokkosBatched { template struct GMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const OperatorType &A, const VectorViewType &B, - const VectorViewType &X, - const KrylovHandle - &handle) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const OperatorType &A, + const VectorViewType &B, + const VectorViewType &X, + const KrylovHandleType &handle) { int status = 0; - if (std::is_same::value) { + if (std::is_same::value) { + status = SerialGMRES::template invoke( + A, B, X, handle); + } else if (std::is_same::value) { status = TeamGMRES::template invoke( member, A, B, X, handle); diff --git a/src/batched/sparse/KokkosBatched_Identity.hpp b/src/batched/sparse/KokkosBatched_Identity.hpp index 57934df66a..6613bdd1ec 100644 --- a/src/batched/sparse/KokkosBatched_Identity.hpp +++ b/src/batched/sparse/KokkosBatched_Identity.hpp @@ -60,8 +60,8 @@ class Identity { KOKKOS_INLINE_FUNCTION ~Identity() {} - template + template KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, const XViewType &X, const YViewType &Y) const { @@ -76,6 +76,14 @@ class Identity { } } } + template + KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, + const YViewType &Y) const { + if (sameXY == 0) { + SerialCopy::invoke(X, Y); + } + } }; } // namespace KokkosBatched diff --git a/src/batched/sparse/KokkosBatched_JacobiPrec.hpp b/src/batched/sparse/KokkosBatched_JacobiPrec.hpp index 129378ed43..e4bfbefd0f 100644 --- a/src/batched/sparse/KokkosBatched_JacobiPrec.hpp +++ b/src/batched/sparse/KokkosBatched_JacobiPrec.hpp @@ -77,6 +77,8 @@ class JacobiPrec { KOKKOS_INLINE_FUNCTION ~JacobiPrec() {} + KOKKOS_INLINE_FUNCTION void setComputedInverse() { computed_inverse = true; } + template KOKKOS_INLINE_FUNCTION void computeInverse(const MemberType &member) const { auto one = Kokkos::Details::ArithTraits::one(); @@ -141,8 +143,30 @@ class JacobiPrec { computed_inverse = true; } - template + KOKKOS_INLINE_FUNCTION void computeInverse() const { + auto one = Kokkos::Details::ArithTraits::one(); + auto epsilon = Kokkos::Details::ArithTraits::epsilon(); + int tooSmall = 0; + + for (int i = 0; i < n_operators; ++i) + for (int j = 0; j < n_colums; ++j) { + if (Kokkos::abs(diag_values(i, j)) <= epsilon) { + ++tooSmall; + diag_values(i, j) = one; + } else + diag_values(i, j) = one / diag_values(i, j); + } + + if (tooSmall > 0) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " + "magnitude and have been replaced by one, \n", + (int)tooSmall); + computed_inverse = true; + } + + template KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, const XViewType &X, const YViewType &Y) const { @@ -154,6 +178,19 @@ class JacobiPrec { KokkosBatched::HadamardProduct::template invoke< ValuesViewType, XViewType, YViewType>(member, diag_values, X, Y); } + + template + KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, + const YViewType &Y) const { + if (!computed_inverse) { + this->computeInverse(); + } + + KokkosBatched::SerialHadamardProduct::template invoke( + diag_values, X, Y); + } }; } // namespace KokkosBatched diff --git a/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp b/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp index f14eac7065..1faabcc993 100644 --- a/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp +++ b/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp @@ -56,21 +56,154 @@ namespace KokkosBatched { /// /// \tparam scalar_type: Scalar type of the linear solver -template +template class KrylovHandle { public: - using norm_type = - typename Kokkos::Details::ArithTraits::mag_type; + using norm_type = typename NormViewType::non_const_value_type; + + typedef ViewType3D ArnoldiViewType; + typedef Kokkos::View + TemporaryViewType; + + public: + NormViewType residual_norms; + IntViewType iteration_numbers; + typename NormViewType::HostMirror residual_norms_host; + typename IntViewType::HostMirror iteration_numbers_host; + IntViewType first_index; + IntViewType last_index; + ArnoldiViewType Arnoldi_view; + TemporaryViewType tmp_view; private: norm_type tolerance; + norm_type max_tolerance; int max_iteration; + int batched_size; + int N_team; + int ortho_strategy; + int scratch_pad_level; + bool compute_last_residual; + bool monitor_residual; + bool host_synchronised; public: - KOKKOS_INLINE_FUNCTION - KrylovHandle() { + KrylovHandle(int _batched_size, int _N_team, int _max_iteration = 200, + bool _monitor_residual = false) + : max_iteration(_max_iteration), + batched_size(_batched_size), + N_team(_N_team), + monitor_residual(_monitor_residual) { tolerance = Kokkos::Details::ArithTraits::epsilon(); - max_iteration = 200; + max_tolerance = 1e-30; + if (std::is_same::value) max_tolerance = 1e-50; + if (monitor_residual) { + residual_norms = NormViewType("", batched_size, max_iteration + 2); + } + iteration_numbers = IntViewType("", batched_size); + Kokkos::deep_copy(iteration_numbers, -1); + + int n_teams = ceil(1. * batched_size / N_team); + first_index = IntViewType("", n_teams); + last_index = IntViewType("", n_teams); + + auto first_index_host = Kokkos::create_mirror_view(first_index); + auto last_index_host = Kokkos::create_mirror_view(last_index); + + first_index_host(0) = 0; + last_index_host(0) = N_team; + for (int i = 1; i < n_teams; ++i) { + first_index_host(i) = last_index_host(i - 1); + last_index_host(i) = first_index_host(i) + N_team; + } + last_index_host(n_teams - 1) = batched_size; + + Kokkos::deep_copy(first_index, first_index_host); + Kokkos::deep_copy(last_index, last_index_host); + + // Default Classical GS + ortho_strategy = 1; + scratch_pad_level = 0; + compute_last_residual = true; + host_synchronised = false; + } + + /// \brief reset + /// Reset the iteration numbers to the default value of -1 + /// and the residual norms if monitored. + /// (Usefull when mulitple consecutive solvers use the same handle) + /// + + void reset() { + Kokkos::deep_copy(iteration_numbers, -1); + if (monitor_residual) { + Kokkos::deep_copy(residual_norms, 0.); + } + host_synchronised = false; + } + + /// + + void synchronise_host() { + iteration_numbers_host = Kokkos::create_mirror_view(iteration_numbers); + Kokkos::deep_copy(iteration_numbers_host, iteration_numbers); + if (monitor_residual) { + residual_norms_host = Kokkos::create_mirror_view(residual_norms); + Kokkos::deep_copy(residual_norms_host, residual_norms); + } + host_synchronised = true; + } + + /// \brief is_converged + /// Test if all the systems have converged. + /// + + KOKKOS_INLINE_FUNCTION + bool is_converged() const { + bool all_converged = true; + for (size_t i = 0; i < batched_size; ++i) + if (iteration_numbers(i) == -1) { + all_converged = false; + break; + } + return all_converged; + } + + /// \brief is_converged_host + /// Test if all the systems have converged (host). + /// + + bool is_converged_host() { + if (!host_synchronised) this->synchronise_host(); + bool all_converged = true; + for (int i = 0; i < batched_size; ++i) + if (iteration_numbers_host(i) == -1) { + all_converged = false; + break; + } + return all_converged; + } + + /// \brief is_converged + /// Test if one particular system has converged. + /// + /// \param batched_id [in]: Global batched ID + + KOKKOS_INLINE_FUNCTION + bool is_converged(int batched_id) const { + return (iteration_numbers(batched_id) != -1); + } + + /// \brief is_converged + /// Test if one particular system has converged (host). + /// + /// \param batched_id [in]: Global batched ID + + bool is_converged_host(int batched_id) { + if (!host_synchronised) this->synchronise_host(); + return (iteration_numbers_host(batched_id) != -1); } /// \brief set_tolerance @@ -87,21 +220,246 @@ class KrylovHandle { KOKKOS_INLINE_FUNCTION norm_type get_tolerance() const { return tolerance; } + /// \brief set_max_tolerance + /// Set the maximal tolerance of the batched Krylov solver + /// + /// \param _max_tolerance [in]: New tolerance + + KOKKOS_INLINE_FUNCTION + void set_max_tolerance(norm_type _max_tolerance) { + max_tolerance = _max_tolerance; + } + + /// \brief get_max_tolerance + /// Get the maximal tolerance of the batched Krylov solver + + KOKKOS_INLINE_FUNCTION + norm_type get_max_tolerance() const { return max_tolerance; } + /// \brief set_max_iteration /// Set the maximum number of iterations of the batched Krylov solver /// /// \param _max_iteration [in]: New maximum number of iterations KOKKOS_INLINE_FUNCTION - void set_max_iteration(norm_type _max_iteration) { - max_iteration = _max_iteration; - } + void set_max_iteration(int _max_iteration) { max_iteration = _max_iteration; } /// \brief get_max_iteration /// Get the maximum number of iterations of the batched Krylov solver KOKKOS_INLINE_FUNCTION int get_max_iteration() const { return max_iteration; } + + /// \brief set_norm + /// Store the norm of one of the system at one of the iteration + /// + /// \param batched_id [in]: Global batched ID + /// \param iteration_id [in]: Iteration ID + /// \param norm_i [in]: Norm to store + + KOKKOS_INLINE_FUNCTION + void set_norm(int batched_id, int iteration_id, norm_type norm_i) const { + if (monitor_residual) residual_norms(batched_id, iteration_id) = norm_i; + } + + /// \brief set_norm + /// Store the norm of one of the system at one of the iteration + /// + /// \param batchedteam_id [in]: Team ID + /// \param batched_id [in]: Local batched ID (local ID within the team) + /// \param iteration_id [in]: Iteration ID + /// \param norm_i [in]: Norm to store + + KOKKOS_INLINE_FUNCTION + void set_norm(int team_id, int batched_id, int iteration_id, + norm_type norm_i) const { + if (monitor_residual) + residual_norms(team_id * N_team + batched_id, iteration_id) = norm_i; + } + + /// \brief get_norm + /// Get the norm of one system at a given iteration + /// + /// \param batched_id [in]: Global batched ID + /// \param iteration_id [in]: Iteration ID + + KOKKOS_INLINE_FUNCTION + norm_type get_norm(int batched_id, int iteration_id) const { + if (monitor_residual) { + return residual_norms(batched_id, iteration_id); + } else + return 0; + } + + /// \brief get_norm_host + /// Get the norm of one system at a given iteration (host) + /// + /// \param batched_id [in]: Global batched ID + /// \param iteration_id [in]: Iteration ID + + norm_type get_norm_host(int batched_id, int iteration_id) { + if (monitor_residual) { + if (!host_synchronised) this->synchronise_host(); + return residual_norms_host(batched_id, iteration_id); + } else + return 0; + } + + /// \brief set_last_norm + /// Store the last norm of one system + /// + /// \param batched_id [in]: Global batched ID + /// \param norm_i [in]: Norm to store + + KOKKOS_INLINE_FUNCTION + void set_last_norm(int batched_id, norm_type norm_i) const { + if (monitor_residual) + residual_norms(batched_id, max_iteration + 1) = norm_i; + } + + /// \brief set_last_norm + /// Store the last norm of one system + /// + /// \param batchedteam_id [in]: Team ID + /// \param batched_id [in]: Local batched ID (local ID within the team) + /// \param batched_id [in]: Global batched ID + /// \param norm_i [in]: Norm to store + + KOKKOS_INLINE_FUNCTION + void set_last_norm(int team_id, int batched_id, norm_type norm_i) const { + if (monitor_residual) + residual_norms(team_id * N_team + batched_id, max_iteration + 1) = norm_i; + } + + /// \brief get_last_norm + /// Get the last norm of one system + /// + /// \param batched_id [in]: Global batched ID + + KOKKOS_INLINE_FUNCTION + norm_type get_last_norm(int batched_id) const { + if (monitor_residual && compute_last_residual) { + return residual_norms(batched_id, max_iteration + 1); + } else + return 0; + } + + /// \brief get_last_norm_host + /// Get the last norm of one system (host) + /// + /// \param batched_id [in]: Global batched ID + + norm_type get_last_norm_host(int batched_id) { + if (monitor_residual && compute_last_residual) { + if (!host_synchronised) this->synchronise_host(); + return residual_norms_host(batched_id, max_iteration + 1); + } else + return 0; + } + + /// \brief set_iteration + /// Store the number of iteration after convergence for one system + /// + /// \param batched_id [in]: Global batched ID + /// \param iteration_id [in]: Iteration ID + + KOKKOS_INLINE_FUNCTION + void set_iteration(int batched_id, int iteration_id) const { + iteration_numbers(batched_id) = iteration_id; + } + + /// \brief set_iteration + /// Store the number of iteration after convergence for one system + /// + /// \param batchedteam_id [in]: Team ID + /// \param batched_id [in]: Local batched ID (local ID within the team) + /// \param iteration_id [in]: Iteration ID + + KOKKOS_INLINE_FUNCTION + void set_iteration(int team_id, int batched_id, int iteration_id) const { + iteration_numbers(team_id * N_team + batched_id) = iteration_id; + } + + /// \brief get_iteration + /// Get the number of iteration after convergence for one system + /// + /// \param batched_id [in]: Global batched ID + + KOKKOS_INLINE_FUNCTION + int get_iteration(int batched_id) const { + return iteration_numbers(batched_id); + } + + /// \brief get_iteration_host + /// Get the number of iteration after convergence for one system (host) + /// + /// \param batched_id [in]: Global batched ID + + int get_iteration_host(int batched_id) { + if (!host_synchronised) this->synchronise_host(); + return iteration_numbers_host(batched_id); + } + + /// \brief set_ortho_strategy + /// Set the used orthogonalization strategy. + /// Either classical GS (_ortho_strategy=0) or modified GS + /// (_ortho_strategy=1) + /// + /// \param _ortho_strategy [in]: used orthogonalization strategy + + KOKKOS_INLINE_FUNCTION + void set_ortho_strategy(int _ortho_strategy) { + ortho_strategy = _ortho_strategy; + } + + /// \brief get_ortho_strategy + /// Get the used orthogonalization strategy. + /// Either classical GS (_ortho_strategy=0) or modified GS + /// (_ortho_strategy=1) + + KOKKOS_INLINE_FUNCTION + int get_ortho_strategy() const { return ortho_strategy; } + + /// \brief set_scratch_pad_level + /// Set the scratch pad level used to store temporary variables. + /// + /// \param _scratch_pad_level [in]: used level + + KOKKOS_INLINE_FUNCTION + void set_scratch_pad_level(int _scratch_pad_level) { + scratch_pad_level = _scratch_pad_level; + } + + /// \brief get_scratch_pad_level + /// Get the scratch pad level used to store temporary variables. + + KOKKOS_INLINE_FUNCTION + int get_scratch_pad_level() const { return scratch_pad_level; } + + /// \brief set_compute_last_residual + /// Select if the last residual is explicitly computed. + /// + /// \param _compute_last_residual [in]: boolean that specifies if we compute + /// the last residual explicitly + + KOKKOS_INLINE_FUNCTION + void set_compute_last_residual(bool _compute_last_residual) { + if (monitor_residual) + compute_last_residual = _compute_last_residual; + else + compute_last_residual = false; + } + + /// \brief get_compute_last_residual + /// Specify if the last residual has to be computed explicitly. + + KOKKOS_INLINE_FUNCTION + bool get_compute_last_residual() const { + if (monitor_residual) + return compute_last_residual; + else + return false; + } }; } // namespace KokkosBatched diff --git a/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp index 83e8fb90ed..f32c02417c 100644 --- a/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp @@ -62,12 +62,13 @@ namespace KokkosBatched { template struct TeamVectorCG { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandle& - handle) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle) { typedef int OrdinalType; typedef typename Kokkos::Details::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; @@ -87,16 +88,29 @@ struct TeamVectorCG { const OrdinalType numMatrices = _X.extent(0); const OrdinalType numRows = _X.extent(1); - ScratchPadVectorViewType P(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType Q(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType R(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType X(member.team_scratch(0), numMatrices, numRows); - - ScratchPadNormViewType sqr_norm_0(member.team_scratch(0), numMatrices); - ScratchPadNormViewType sqr_norm_j(member.team_scratch(0), numMatrices); - ScratchPadNormViewType alpha(member.team_scratch(0), numMatrices); - ScratchPadNormViewType mask(member.team_scratch(0), numMatrices); - ScratchPadNormViewType tmp(member.team_scratch(0), numMatrices); + ScratchPadVectorViewType P( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType Q( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType R( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType X( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + + ScratchPadNormViewType sqr_norm_0( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType sqr_norm_j( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType alpha( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType mask( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType tmp( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); TeamVectorCopy::invoke(member, _X, X); // Deep copy of b into r_0: @@ -104,9 +118,7 @@ struct TeamVectorCG { // r_0 := b - A x_0 member.team_barrier(); - A.template apply(member, X, R, -1, 1); + A.template apply(member, X, R, -1, 1); member.team_barrier(); // Deep copy of r_0 into p_0: @@ -128,9 +140,7 @@ struct TeamVectorCG { for (size_t j = 0; j < maximum_iteration; ++j) { // q := A p_j - A.template apply(member, P, Q); + A.template apply(member, P, Q); member.team_barrier(); TeamVectorDot::invoke(member, P, Q, tmp); diff --git a/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp index 2bc611aa32..02328aaf1a 100644 --- a/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp @@ -61,12 +61,13 @@ namespace KokkosBatched { template struct TeamCG { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandle& - handle) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandle& handle) { typedef int OrdinalType; typedef typename Kokkos::Details::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; @@ -86,16 +87,29 @@ struct TeamCG { const OrdinalType numMatrices = _X.extent(0); const OrdinalType numRows = _X.extent(1); - ScratchPadVectorViewType P(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType Q(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType R(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType X(member.team_scratch(0), numMatrices, numRows); - - ScratchPadNormViewType sqr_norm_0(member.team_scratch(0), numMatrices); - ScratchPadNormViewType sqr_norm_j(member.team_scratch(0), numMatrices); - ScratchPadNormViewType alpha(member.team_scratch(0), numMatrices); - ScratchPadNormViewType mask(member.team_scratch(0), numMatrices); - ScratchPadNormViewType tmp(member.team_scratch(0), numMatrices); + ScratchPadVectorViewType P( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType Q( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType R( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType X( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + + ScratchPadNormViewType sqr_norm_0( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType sqr_norm_j( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType alpha( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType mask( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType tmp( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); TeamCopy::invoke(member, _X, X); // Deep copy of b into r_0: @@ -103,9 +117,7 @@ struct TeamCG { // r_0 := b - A x_0 member.team_barrier(); - A.template apply( - member, X, R, -1, 1); + A.template apply(member, X, R, -1, 1); member.team_barrier(); // Deep copy of r_0 into p_0: @@ -127,9 +139,7 @@ struct TeamCG { for (size_t j = 0; j < maximum_iteration; ++j) { // q := A p_j - A.template apply(member, P, Q); + A.template apply(member, P, Q); member.team_barrier(); TeamDot::invoke(member, P, Q, tmp); diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp new file mode 100644 index 0000000000..db6accce2f --- /dev/null +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp @@ -0,0 +1,333 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +#ifndef __KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP__ +#define __KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP__ + +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "KokkosBatched_Util.hpp" + +#include "KokkosBatched_Axpy.hpp" +#include "KokkosBatched_Copy_Decl.hpp" +#include "KokkosBatched_Dot.hpp" +#include "KokkosBatched_Spmv.hpp" +#include "KokkosBatched_Xpay.hpp" +#include "KokkosBatched_Givens_Serial_Internal.hpp" +#include "KokkosBatched_Trsm_Decl.hpp" +#include "KokkosBatched_Identity.hpp" +#include "KokkosBatched_Gemv_Decl.hpp" + +namespace KokkosBatched { + +/// +/// Serial GMRES +/// + +struct SerialGMRES { + template + KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle, + const int GMRES_id) { + typedef int OrdinalType; + typedef typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef Kokkos::Details::ArithTraits ATM; + + using SerialCopy1D = SerialCopy; + using SerialCopy2D = SerialCopy; + + const OrdinalType numMatrices = _X.extent(0); + const OrdinalType numRows = _X.extent(1); + + size_t maximum_iteration = handle.get_max_iteration() < numRows + ? handle.get_max_iteration() + : numRows; + const MagnitudeType tolerance = handle.get_tolerance(); + const MagnitudeType max_tolerance = handle.get_max_tolerance(); + + int n_V = numRows; + int n_H = maximum_iteration + 1; + int n_Givens = 2; + + int offset_V = 0; + int offset_H = offset_V + n_V; + int offset_Givens = offset_H + n_H; + + const int first_matrix = handle.first_index(GMRES_id); + const int last_matrix = handle.last_index(GMRES_id); + + auto V_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, + Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + + int n_G = maximum_iteration + 1; + int n_W = numRows; + int n_mask = 1; + + int offset_G = 0; + int offset_W = offset_G + n_G; + int offset_mask = offset_W + n_W; + int offset_tmp = offset_mask + n_mask; + + auto G = Kokkos::subview(handle.tmp_view, + Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(handle.tmp_view, + Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::make_pair(offset_W, offset_W + n_W)); + auto mask = Kokkos::subview(handle.tmp_view, + Kokkos::make_pair(first_matrix, last_matrix), + offset_mask); + auto tmp = Kokkos::subview(handle.tmp_view, + Kokkos::make_pair(first_matrix, last_matrix), + offset_tmp); + + // Deep copy of b into r_0: + SerialCopy2D::invoke(_B, W); + + // r_0 := b - A x_0 + A.template apply(_X, W, -1, 1); + + P.template apply(W, W); + + SerialDot::invoke(W, W, tmp); + + for (OrdinalType i = 0; i < numMatrices; ++i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_norm(GMRES_id, i, 0, tmp(i)); + if (tmp(i) > max_tolerance) { + mask(i) = 1; + G(i, 0) = tmp(i); + tmp(i) = 1. / tmp(i); + } else { + handle.set_iteration(GMRES_id, i, 0); + mask(i) = 0; + G(i, 0) = 0.; + tmp(i) = 0.; + } + } + + auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); + for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { + for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) { + V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + } + } + int status = 1; + // int number_not_converged = 0; + + for (size_t j = 0; j < maximum_iteration; ++j) { + // q := A p_j + auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL); + + A.template apply(V_j, W); + + P.template apply(W, W); + + if (handle.get_ortho_strategy() == 0) { + for (OrdinalType l = 0; l < numMatrices; ++l) { + auto W_l = Kokkos::subview(W, l, Kokkos::ALL); + auto V_old = Kokkos::subview( + V_view, l, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = + Kokkos::subview(H_view, l, j, Kokkos::make_pair(0, (int)j + 1)); + + // Inner products + SerialGemv::invoke( + 1, V_old, W_l, 0, H_old); + + // Update + SerialGemv::invoke( + -1, V_old, H_old, 1, W_l); + } + } + if (handle.get_ortho_strategy() == 1) { + for (size_t i = 0; i < j + 1; ++i) { + auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); + SerialDot::invoke(W, V_i, tmp); + SerialCopy1D::invoke(tmp, Kokkos::subview(H_view, Kokkos::ALL, j, i)); + for (OrdinalType ii = 0; ii < numMatrices; ++ii) tmp(ii) = -tmp(ii); + + SerialAxpy::invoke(tmp, V_i, W); + } + } + + SerialDot::invoke(W, W, tmp); + + for (OrdinalType i = 0; i < numMatrices; ++i) { + H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); + tmp(i) = + H_view(i, j, j + 1) > max_tolerance ? 1. / H_view(i, j, j + 1) : 0.; + } + + if (j + 1 < maximum_iteration) { + auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); + for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { + for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) { + V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + } + } + } + + for (OrdinalType l = 0; l < numMatrices; ++l) { + // Apply the previous Givens rotations: + auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); + auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); + auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); + + if (mask(l) == 1.) { + for (size_t i = 0; i < j; ++i) { + auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); + auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); + H_j(i) = tmp1; + H_j(i + 1) = tmp2; + } + + // Compute the new Givens rotation: + Kokkos::pair + G_new(1, 0); + typename VectorViewType::non_const_value_type alpha = 0; + SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); + + Givens_0_l(j) = G_new.first; + Givens_1_l(j) = G_new.second; + + // Apply the new Givens rotation: + auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); + auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); + H_j(j) = tmp1; + H_j(j + 1) = tmp2; + + G(l, j + 1) = -Givens_1_l(j) * G(l, j); + G(l, j) *= Givens_0_l(j); + } else { + H_j(j) = 1.; + G(l, j + 1) = 0.; + } + + auto res_norm = Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + + handle.set_norm(GMRES_id, l, j + 1, res_norm); + + if (mask(l) == 1. && res_norm < tolerance) { + mask(l) = 0.; + G(l, j + 1) = 0.; + handle.set_iteration(GMRES_id, l, j + 1); + } + } + + bool all_converged = true; + for (OrdinalType l = 0; l < numMatrices; ++l) + all_converged = (all_converged && mask(l) == 0.); + if (all_converged) { + maximum_iteration = j + 1; + break; + } + } + + for (OrdinalType l = 0; l < numMatrices; ++l) { + for (size_t i = 0; i < maximum_iteration; ++i) { + size_t row_i = maximum_iteration - 1 - i; + for (size_t j = row_i + 1; j < maximum_iteration; ++j) + G(l, row_i) -= H_view(l, j, row_i) * G(l, j); + G(l, row_i) /= H_view(l, row_i, row_i); + } + } + + if (handle.get_ortho_strategy() == 0) { + for (OrdinalType l = 0; l < numMatrices; ++l) { + SerialGemv::invoke( + 1, + Kokkos::subview(V_view, l, + Kokkos::make_pair(0, (int)maximum_iteration), + Kokkos::ALL), + Kokkos::subview(G, l, Kokkos::make_pair(0, (int)maximum_iteration)), + 1, Kokkos::subview(_X, l, Kokkos::ALL)); + } + } + if (handle.get_ortho_strategy() == 1) { + for (size_t j = 0; j < maximum_iteration; ++j) { + SerialAxpy::invoke(Kokkos::subview(G, Kokkos::ALL, j), + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), + _X); + } + } + + if (handle.get_compute_last_residual()) { + SerialCopy2D::invoke(_B, W); + A.template apply(_X, W, -1, 1); + P.template apply(W, W); + SerialDot::invoke(W, W, tmp); + + for (OrdinalType i = 0; i < numMatrices; ++i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_last_norm(GMRES_id, i, tmp(i)); + } + } + return status; + } + + template + KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle) { + Identity P; + return invoke(A, _B, _X, P, handle); + } +}; +} // namespace KokkosBatched + +#endif diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp index 8e45b97556..a95b712cbb 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp @@ -54,6 +54,7 @@ #include "KokkosBatched_Givens_Serial_Internal.hpp" #include "KokkosBatched_Trsm_Decl.hpp" #include "KokkosBatched_Identity.hpp" +#include "KokkosBatched_Gemv_Decl.hpp" namespace KokkosBatched { @@ -66,12 +67,13 @@ namespace KokkosBatched { template struct TeamVectorGMRES { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandle& - handle) { + typename PrecOperatorType, typename KrylovHandleType> + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle) { typedef int OrdinalType; typedef typename Kokkos::Details::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; @@ -97,132 +99,185 @@ struct TeamVectorGMRES { ? handle.get_max_iteration() : numRows; const MagnitudeType tolerance = handle.get_tolerance(); - const MagnitudeType max_tolerance = 0.; - - ScratchPadMultiVectorViewType V(member.team_scratch(1), numMatrices, - maximum_iteration + 1, numRows); - ScratchPadMultiVectorViewType H(member.team_scratch(1), numMatrices, - maximum_iteration + 1, maximum_iteration); - ScratchPadMultiVectorViewType Givens(member.team_scratch(1), numMatrices, - maximum_iteration, 2); - ScratchPadVectorViewType G(member.team_scratch(1), numMatrices, - maximum_iteration + 1); - - ScratchPadVectorViewType W(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType Q(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType R(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType X(member.team_scratch(0), numMatrices, numRows); - - ScratchPadNormViewType beta(member.team_scratch(0), numMatrices); - ScratchPadNormViewType mask(member.team_scratch(0), numMatrices); - ScratchPadNormViewType tmp(member.team_scratch(0), numMatrices); + const MagnitudeType max_tolerance = handle.get_max_tolerance(); + + int n_V = numRows; + int n_H = maximum_iteration + 1; + int n_Givens = 2; + + int offset_V = 0; + int offset_H = offset_V + n_V; + int offset_Givens = offset_H + n_H; + + const int first_matrix = handle.first_index(member.league_rank()); + const int last_matrix = handle.last_index(member.league_rank()); + + auto V_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, + Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + + int n_G = maximum_iteration + 1; + int n_W = numRows; + int n_X = numRows; + int n_mask = 1; + int n_tmp = 1; + + int offset_G = 0; + int offset_W = offset_G + n_G; + int offset_X = offset_W + n_W; + int offset_mask = offset_X + n_X; + int offset_tmp = offset_mask + n_mask; + + ScratchPadVectorViewType tmp_2D( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_X + n_mask + n_tmp); + + auto G = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_W, offset_W + n_W)); + auto X = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_X, offset_X + n_X)); + auto mask = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_mask); + auto tmp = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_tmp); TeamVectorCopy::invoke(member, _X, X); // Deep copy of b into r_0: - TeamVectorCopy::invoke(member, _B, R); - - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { mask(i) = 1.; }); + TeamVectorCopy::invoke(member, _B, W); // r_0 := b - A x_0 member.team_barrier(); - A.template apply(member, X, R, -1, 1); + A.template apply(member, X, W, -1, 1); member.team_barrier(); - P.template apply(member, R, R); + P.template apply(member, W, W); member.team_barrier(); - TeamVectorDot::invoke(member, R, R, beta); + TeamVectorDot::invoke(member, W, W, tmp); member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& i) { - beta(i) = ATM::sqrt(beta(i)); - G(i, 0) = beta(i) > max_tolerance ? beta(i) : 0.; - tmp(i) = beta(i) > max_tolerance ? 1. / beta(i) : 0.; + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_norm(member.league_rank(), i, 0, tmp(i)); + if (tmp(i) > max_tolerance) { + mask(i) = 1; + G(i, 0) = tmp(i); + tmp(i) = 1. / tmp(i); + } else { + handle.set_iteration(member.league_rank(), i, 0); + mask(i) = 0; + G(i, 0) = 0.; + tmp(i) = 0.; + } }); member.team_barrier(); // Finish writing to tmp + auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); Kokkos::parallel_for( Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { OrdinalType iRow, iMatrix; getIndices( iTemp, numRows, numMatrices, iRow, iMatrix); - V(iMatrix, 0, iRow) = R(iMatrix, iRow) * tmp(iMatrix); + V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); }); - int status = 1; // int number_not_converged = 0; for (size_t j = 0; j < maximum_iteration; ++j) { member.team_barrier(); // Finish writing to V // q := A p_j - auto V_j = Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL); + auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL); - A.template apply(member, V_j, W); + A.template apply(member, V_j, W); member.team_barrier(); - P.template apply(member, W, W); - - for (size_t i = 0; i < j + 1; ++i) { - member.team_barrier(); // Finish writing to W - auto V_i = Kokkos::subview(V, Kokkos::ALL, i, Kokkos::ALL); - TeamVectorDot::invoke(member, W, V_i, tmp); - member.team_barrier(); - TeamVectorCopy1D::invoke(member, tmp, - Kokkos::subview(H, Kokkos::ALL, i, j)); - member.team_barrier(); // Don't start modifying tmp until copy above - // finishes - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); + P.template apply(member, W, W); + member.team_barrier(); - member.team_barrier(); // Finish writing to tmp + if (handle.get_ortho_strategy() == 0) { + auto V_old = Kokkos::subview( + V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, + Kokkos::make_pair(0, (int)j + 1)); + member.team_barrier(); + // Inner products + TeamVectorGemv::invoke(member, 1, V_old, W, 0, + H_old); + member.team_barrier(); - TeamVectorAxpy::invoke(member, tmp, V_i, W); + // Update + TeamVectorGemv::invoke(member, -1, V_old, H_old, + 1, W); + member.team_barrier(); + } + if (handle.get_ortho_strategy() == 1) { + for (size_t i = 0; i < j + 1; ++i) { + auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); + TeamVectorDot::invoke(member, W, V_i, tmp); + member.team_barrier(); + TeamVectorCopy1D::invoke(member, tmp, + Kokkos::subview(H_view, Kokkos::ALL, j, i)); + member.team_barrier(); + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); + + member.team_barrier(); // Finish writing to tmp + + TeamVectorAxpy::invoke(member, tmp, V_i, W); + member.team_barrier(); // Finish writing to W + } } member.team_barrier(); // Finish writing to W TeamVectorDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - H(i, j + 1, j) = ATM::sqrt(tmp(i)); - tmp(i) = H(i, j + 1, j) > max_tolerance ? 1. / H(i, j + 1, j) : 0.; - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); + tmp(i) = H_view(i, j, j + 1) > max_tolerance + ? 1. / H_view(i, j, j + 1) + : 0.; + }); member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V(iMatrix, j + 1, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); + if (j + 1 < maximum_iteration) { + auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), + [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices( + iTemp, numRows, numMatrices, iRow, iMatrix); + V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); + member.team_barrier(); + } Kokkos::parallel_for( Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& l) { // Apply the previous Givens rotations: - auto H_j = Kokkos::subview(H, l, Kokkos::ALL, j); + auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); + auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); + auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); if (mask(l) == 1.) { for (size_t i = 0; i < j; ++i) { - auto tmp1 = - Givens(l, i, 0) * H_j(i) + Givens(l, i, 1) * H_j(i + 1); + auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); auto tmp2 = - -Givens(l, i, 1) * H_j(i) + Givens(l, i, 0) * H_j(i + 1); + -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); H_j(i) = tmp1; H_j(i + 1) = tmp2; } @@ -234,68 +289,112 @@ struct TeamVectorGMRES { typename VectorViewType::non_const_value_type alpha = 0; SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); - Givens(l, j, 0) = G_new.first; - Givens(l, j, 1) = G_new.second; + Givens_0_l(j) = G_new.first; + Givens_1_l(j) = G_new.second; // Apply the new Givens rotation: - auto tmp1 = - Givens(l, j, 0) * H_j(j) + Givens(l, j, 1) * H_j(j + 1); - auto tmp2 = - -Givens(l, j, 1) * H_j(j) + Givens(l, j, 0) * H_j(j + 1); + auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); + auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); H_j(j) = tmp1; H_j(j + 1) = tmp2; - G(l, j + 1) = -Givens(l, j, 1) * G(l, j); - G(l, j) *= Givens(l, j, 0); + G(l, j + 1) = -Givens_1_l(j) * G(l, j); + G(l, j) *= Givens_0_l(j); } else { H_j(j) = 1.; G(l, j + 1) = 0.; } - if (mask(l) == 1. && - Kokkos::ArithTraits::abs(G(l, j + 1)) / beta(l) < - tolerance) { + auto res_norm = + Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + + handle.set_norm(member.league_rank(), l, j + 1, res_norm); + + if (mask(l) == 1. && res_norm < tolerance) { mask(l) = 0.; G(l, j + 1) = 0.; + handle.set_iteration(member.league_rank(), l, j + 1); } }); + member.team_barrier(); + + bool all_converged = true; + for (OrdinalType l = 0; l < numMatrices; ++l) + all_converged = (all_converged && mask(l) == 0.); + if (all_converged) { + maximum_iteration = j + 1; + break; + } } member.team_barrier(); // Finish writing to G - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - SerialTrsm::template invoke(1, - Kokkos::subview( - H, l, - Kokkos::ALL, - Kokkos::ALL), - Kokkos::subview( - G, l, - Kokkos::ALL)); - }); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& l) { + for (size_t i = 0; i < maximum_iteration; ++i) { + size_t row_i = maximum_iteration - 1 - i; + for (size_t j = row_i + 1; j < maximum_iteration; + ++j) + G(l, row_i) -= H_view(l, j, row_i) * G(l, j); + G(l, row_i) /= H_view(l, row_i, row_i); + } + }); member.team_barrier(); // Finish writing to G - for (size_t j = 0; j < maximum_iteration; ++j) { - TeamVectorAxpy::invoke( - member, Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL), X); - member.team_barrier(); // Finish writing to X + if (handle.get_ortho_strategy() == 0) { + TeamVectorGemv:: + invoke(member, 1, + Kokkos::subview(V_view, Kokkos::ALL, + Kokkos::make_pair(0, (int)maximum_iteration), + Kokkos::ALL), + Kokkos::subview(G, Kokkos::ALL, + Kokkos::make_pair(0, (int)maximum_iteration)), + 1, X); + } + if (handle.get_ortho_strategy() == 1) { + for (size_t j = 0; j < maximum_iteration; ++j) { + TeamVectorAxpy::invoke( + member, Kokkos::subview(G, Kokkos::ALL, j), + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), X); + member.team_barrier(); // Finish writing to X + } } + member.team_barrier(); // Finish writing to X + TeamVectorCopy::invoke(member, X, _X); + + member.team_barrier(); + + if (handle.get_compute_last_residual()) { + TeamVectorCopy::invoke(member, _B, W); + member.team_barrier(); + A.template apply(member, X, W, -1, + 1); + member.team_barrier(); + P.template apply(member, W, W); + member.team_barrier(); + TeamVectorDot::invoke(member, W, W, tmp); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_last_norm(member.league_rank(), i, + tmp(i)); + }); + } return status; } - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandle& - handle) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle) { Identity P; return invoke(member, A, _B, _X, P, handle); diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp index 4b4bd06bc0..58d136e69c 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp @@ -54,6 +54,7 @@ #include "KokkosBatched_Givens_Serial_Internal.hpp" #include "KokkosBatched_Trsm_Decl.hpp" #include "KokkosBatched_Identity.hpp" +#include "KokkosBatched_Gemv_Decl.hpp" namespace KokkosBatched { @@ -65,12 +66,13 @@ namespace KokkosBatched { template struct TeamGMRES { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandle& - handle) { + typename PrecOperatorType, typename KrylovHandleType> + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle) { typedef int OrdinalType; typedef typename Kokkos::Details::ArithTraits< typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; @@ -96,130 +98,183 @@ struct TeamGMRES { ? handle.get_max_iteration() : numRows; const MagnitudeType tolerance = handle.get_tolerance(); - const MagnitudeType max_tolerance = 0.; - - ScratchPadMultiVectorViewType V(member.team_scratch(1), numMatrices, - maximum_iteration + 1, numRows); - ScratchPadMultiVectorViewType H(member.team_scratch(1), numMatrices, - maximum_iteration + 1, maximum_iteration); - ScratchPadMultiVectorViewType Givens(member.team_scratch(1), numMatrices, - maximum_iteration, 2); - ScratchPadVectorViewType G(member.team_scratch(1), numMatrices, - maximum_iteration + 1); - - ScratchPadVectorViewType W(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType Q(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType R(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType X(member.team_scratch(0), numMatrices, numRows); - - ScratchPadNormViewType beta(member.team_scratch(0), numMatrices); - ScratchPadNormViewType mask(member.team_scratch(0), numMatrices); - ScratchPadNormViewType tmp(member.team_scratch(0), numMatrices); + const MagnitudeType max_tolerance = handle.get_max_tolerance(); + + int n_V = numRows; + int n_H = maximum_iteration + 1; + int n_Givens = 2; + + int offset_V = 0; + int offset_H = offset_V + n_V; + int offset_Givens = offset_H + n_H; + + const int first_matrix = handle.first_index(member.league_rank()); + const int last_matrix = handle.last_index(member.league_rank()); + + auto V_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, + Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + + int n_G = maximum_iteration + 1; + int n_W = numRows; + int n_X = numRows; + int n_mask = 1; + int n_tmp = 1; + + int offset_G = 0; + int offset_W = offset_G + n_G; + int offset_X = offset_W + n_W; + int offset_mask = offset_X + n_X; + int offset_tmp = offset_mask + n_mask; + + ScratchPadVectorViewType tmp_2D( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_X + n_mask + n_tmp); + + auto G = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_W, offset_W + n_W)); + auto X = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_X, offset_X + n_X)); + auto mask = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_mask); + auto tmp = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_tmp); TeamCopy::invoke(member, _X, X); // Deep copy of b into r_0: - TeamCopy::invoke(member, _B, R); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { mask(i) = 1.; }); + TeamCopy::invoke(member, _B, W); // r_0 := b - A x_0 member.team_barrier(); - A.template apply( - member, X, R, -1, 1); + A.template apply(member, X, W, -1, 1); member.team_barrier(); - P.template apply(member, R, R); + P.template apply(member, W, W); member.team_barrier(); - TeamDot::invoke(member, R, R, beta); + TeamDot::invoke(member, W, W, tmp); member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& i) { - beta(i) = ATM::sqrt(beta(i)); - G(i, 0) = beta(i) > max_tolerance ? beta(i) : 0.; - tmp(i) = beta(i) > max_tolerance ? 1. / beta(i) : 0.; + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_norm(member.league_rank(), i, 0, tmp(i)); + if (tmp(i) > max_tolerance) { + mask(i) = 1; + G(i, 0) = tmp(i); + tmp(i) = 1. / tmp(i); + } else { + handle.set_iteration(member.league_rank(), i, 0); + mask(i) = 0; + G(i, 0) = 0.; + tmp(i) = 0.; + } }); member.team_barrier(); // Finish writing to tmp + auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { OrdinalType iRow, iMatrix; getIndices( iTemp, numRows, numMatrices, iRow, iMatrix); - V(iMatrix, 0, iRow) = R(iMatrix, iRow) * tmp(iMatrix); + V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); }); - int status = 1; // int number_not_converged = 0; for (size_t j = 0; j < maximum_iteration; ++j) { member.team_barrier(); // Finish writing to V // q := A p_j - auto V_j = Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL); + auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL); - A.template apply(member, V_j, W); + A.template apply(member, V_j, W); member.team_barrier(); - P.template apply(member, W, W); - - for (size_t i = 0; i < j + 1; ++i) { - member.team_barrier(); // Finish writing to W - auto V_i = Kokkos::subview(V, Kokkos::ALL, i, Kokkos::ALL); - TeamDot::invoke(member, W, V_i, tmp); - member.team_barrier(); - TeamCopy1D::invoke(member, tmp, Kokkos::subview(H, Kokkos::ALL, i, j)); - member.team_barrier(); // Don't start modifying tmp until copy above - // finishes - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); - member.team_barrier(); // Finish writing to tmp + P.template apply(member, W, W); + member.team_barrier(); - TeamAxpy::invoke(member, tmp, V_i, W); + if (handle.get_ortho_strategy() == 0) { + auto V_old = Kokkos::subview( + V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, + Kokkos::make_pair(0, (int)j + 1)); + member.team_barrier(); + // Inner products + TeamGemv::invoke( + member, 1, V_old, W, 0, H_old); + member.team_barrier(); + + // Update + TeamGemv::invoke( + member, -1, V_old, H_old, 1, W); + member.team_barrier(); + } + if (handle.get_ortho_strategy() == 1) { + for (size_t i = 0; i < j + 1; ++i) { + auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); + TeamDot::invoke(member, W, V_i, tmp); + member.team_barrier(); + TeamCopy1D::invoke(member, tmp, + Kokkos::subview(H_view, Kokkos::ALL, j, i)); + member.team_barrier(); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); + + member.team_barrier(); // Finish writing to tmp + + TeamAxpy::invoke(member, tmp, V_i, W); + member.team_barrier(); // Finish writing to W + } } member.team_barrier(); // Finish writing to W TeamDot::invoke(member, W, W, tmp); member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - H(i, j + 1, j) = ATM::sqrt(tmp(i)); - tmp(i) = H(i, j + 1, j) > max_tolerance ? 1. / H(i, j + 1, j) : 0.; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); + tmp(i) = H_view(i, j, j + 1) > max_tolerance + ? 1. / H_view(i, j, j + 1) + : 0.; + }); member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V(iMatrix, j + 1, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); + if (j + 1 < maximum_iteration) { + auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), + [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices( + iTemp, numRows, numMatrices, iRow, iMatrix); + V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); + member.team_barrier(); + } Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& l) { // Apply the previous Givens rotations: - auto H_j = Kokkos::subview(H, l, Kokkos::ALL, j); + auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); + auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); + auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); if (mask(l) == 1.) { for (size_t i = 0; i < j; ++i) { - auto tmp1 = - Givens(l, i, 0) * H_j(i) + Givens(l, i, 1) * H_j(i + 1); + auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); auto tmp2 = - -Givens(l, i, 1) * H_j(i) + Givens(l, i, 0) * H_j(i + 1); + -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); H_j(i) = tmp1; H_j(i + 1) = tmp2; } @@ -231,68 +286,111 @@ struct TeamGMRES { typename VectorViewType::non_const_value_type alpha = 0; SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); - Givens(l, j, 0) = G_new.first; - Givens(l, j, 1) = G_new.second; + Givens_0_l(j) = G_new.first; + Givens_1_l(j) = G_new.second; // Apply the new Givens rotation: - auto tmp1 = - Givens(l, j, 0) * H_j(j) + Givens(l, j, 1) * H_j(j + 1); - auto tmp2 = - -Givens(l, j, 1) * H_j(j) + Givens(l, j, 0) * H_j(j + 1); + auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); + auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); H_j(j) = tmp1; H_j(j + 1) = tmp2; - G(l, j + 1) = -Givens(l, j, 1) * G(l, j); - G(l, j) *= Givens(l, j, 0); + G(l, j + 1) = -Givens_1_l(j) * G(l, j); + G(l, j) *= Givens_0_l(j); } else { H_j(j) = 1.; G(l, j + 1) = 0.; } - if (mask(l) == 1. && - Kokkos::ArithTraits::abs(G(l, j + 1)) / beta(l) < - tolerance) { + auto res_norm = + Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + + handle.set_norm(member.league_rank(), l, j + 1, res_norm); + + if (mask(l) == 1. && res_norm < tolerance) { mask(l) = 0.; G(l, j + 1) = 0.; + handle.set_iteration(member.league_rank(), l, j + 1); } }); + member.team_barrier(); + + bool all_converged = true; + for (OrdinalType l = 0; l < numMatrices; ++l) + all_converged = (all_converged && mask(l) == 0.); + if (all_converged) { + maximum_iteration = j + 1; + break; + } } member.team_barrier(); // Finish writing to G - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - SerialTrsm::template invoke(1, - Kokkos::subview( - H, l, - Kokkos::ALL, - Kokkos::ALL), - Kokkos::subview( - G, l, - Kokkos::ALL)); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& l) { + for (size_t i = 0; i < maximum_iteration; ++i) { + size_t row_i = maximum_iteration - 1 - i; + for (size_t j = row_i + 1; j < maximum_iteration; + ++j) + G(l, row_i) -= H_view(l, j, row_i) * G(l, j); + G(l, row_i) /= H_view(l, row_i, row_i); + } + }); member.team_barrier(); // Finish writing to G - for (size_t j = 0; j < maximum_iteration; ++j) { - TeamAxpy::invoke( - member, Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL), X); - member.team_barrier(); // Finish writing to X + if (handle.get_ortho_strategy() == 0) { + TeamGemv::invoke( + member, 1, + Kokkos::subview(V_view, Kokkos::ALL, + Kokkos::make_pair(0, (int)maximum_iteration), + Kokkos::ALL), + Kokkos::subview(G, Kokkos::ALL, + Kokkos::make_pair(0, (int)maximum_iteration)), + 1, X); + } + if (handle.get_ortho_strategy() == 1) { + for (size_t j = 0; j < maximum_iteration; ++j) { + TeamAxpy::invoke( + member, Kokkos::subview(G, Kokkos::ALL, j), + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), X); + member.team_barrier(); // Finish writing to X + } } + member.team_barrier(); // Finish writing to X + TeamCopy::invoke(member, X, _X); + + member.team_barrier(); + + if (handle.get_compute_last_residual()) { + TeamCopy::invoke(member, _B, W); + member.team_barrier(); + A.template apply(member, X, W, -1, 1); + member.team_barrier(); + P.template apply(member, W, W); + member.team_barrier(); + TeamDot::invoke(member, W, W, tmp); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_last_norm(member.league_rank(), i, + tmp(i)); + }); + } return status; } - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandle& - handle) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle) { Identity P; return invoke(member, A, _B, _X, P, handle); diff --git a/unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp b/unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp new file mode 100644 index 0000000000..15af38bef5 --- /dev/null +++ b/unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp @@ -0,0 +1,239 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" +#include "KokkosBatched_GMRES.hpp" +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosBatched_CrsMatrix.hpp" +#include "Test_Batched_SparseUtils.hpp" +#include "KokkosBatched_JacobiPrec.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace GMRES { + +template +struct Functor_TestBatchedSerialGMRES { + const ValuesViewType _D; + const IntView _r; + const IntView _c; + const VectorViewType _X; + const VectorViewType _B; + const VectorViewType _Diag; + const int _N_team; + KrylovHandleType _handle; + + Functor_TestBatchedSerialGMRES(const ValuesViewType &D, const IntView &r, + const IntView &c, const VectorViewType &X, + const VectorViewType &B, + const VectorViewType &diag, const int N_team, + KrylovHandleType &handle) + : _D(D), + _r(r), + _c(c), + _X(X), + _B(B), + _N_team(N_team), + _Diag(diag), + _handle(handle) {} + + KOKKOS_INLINE_FUNCTION void operator()(const int k) const { + const int first_matrix = _handle.first_index(k); + const int last_matrix = _handle.last_index(k); + + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + auto diag = Kokkos::subview( + _Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + + using Operator = KokkosBatched::CrsMatrix; + using PrecOperator = KokkosBatched::JacobiPrec; + + Operator A(d, _r, _c); + PrecOperator P(diag); + P.setComputedInverse(); + + KokkosBatched::SerialGMRES::template invoke( + A, b, x, P, _handle, k); + } + + inline void run() { + typedef typename ValuesViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialGMRES"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _D.extent(0) / _N_team); + + const int N = _D.extent(0); + const int n = _X.extent(1); + const int maximum_iteration = _handle.get_max_iteration(); + + _handle.set_ortho_strategy(0); + _handle.set_compute_last_residual(false); + _handle.set_tolerance(1e-8); + + _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType( + "", N, maximum_iteration, n + maximum_iteration + 3); + _handle.tmp_view = typename KrylovHandleType::TemporaryViewType( + "", N, n + maximum_iteration + 3); + + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } +}; + +template +void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { + typedef typename ValuesViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + const int nnz = (BlkSize - 2) * 3 + 2 * 2; + + VectorViewType X("x0", N, BlkSize); + VectorViewType R("r0", N, BlkSize); + VectorViewType B("b", N, BlkSize); + ValuesViewType D("D", N, nnz); + ValuesViewType Diag("Diag", N, BlkSize); + IntView r("r", BlkSize + 1); + IntView c("c", nnz); + + using ScalarType = typename ValuesViewType::non_const_value_type; + using Layout = typename ValuesViewType::array_layout; + using EXSP = typename ValuesViewType::execution_space; + + using MagnitudeType = + typename Kokkos::Details::ArithTraits::mag_type; + using NormViewType = Kokkos::View; + + using Norm2DViewType = Kokkos::View; + using Scalar3DViewType = Kokkos::View; + using IntViewType = Kokkos::View; + + using KrylovHandleType = + KrylovHandle; + + NormViewType sqr_norm_0("sqr_norm_0", N); + NormViewType sqr_norm_j("sqr_norm_j", N); + + create_tridiagonal_batched_matrices(nnz, BlkSize, N, r, c, D, X, B); + + { + auto diag_values_host = Kokkos::create_mirror_view(Diag); + auto values_host = Kokkos::create_mirror_view(D); + auto row_ptr_host = Kokkos::create_mirror_view(r); + auto colIndices_host = Kokkos::create_mirror_view(c); + + Kokkos::deep_copy(values_host, D); + Kokkos::deep_copy(row_ptr_host, r); + Kokkos::deep_copy(colIndices_host, c); + + int current_index; + for (int i = 0; i < BlkSize; ++i) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); + ++current_index) { + if (colIndices_host(current_index) == i) break; + } + for (int j = 0; j < N; ++j) + diag_values_host(j, i) = values_host(j, current_index); + } + + Kokkos::deep_copy(Diag, diag_values_host); + } + + // Compute initial norm + + Kokkos::deep_copy(R, B); + + auto sqr_norm_0_host = Kokkos::create_mirror_view(sqr_norm_0); + auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); + auto R_host = Kokkos::create_mirror_view(R); + auto X_host = Kokkos::create_mirror_view(X); + auto D_host = Kokkos::create_mirror_view(D); + auto r_host = Kokkos::create_mirror_view(r); + auto c_host = Kokkos::create_mirror_view(c); + + Kokkos::deep_copy(R, B); + Kokkos::deep_copy(R_host, R); + Kokkos::deep_copy(X_host, X); + + Kokkos::deep_copy(c_host, c); + Kokkos::deep_copy(r_host, r); + Kokkos::deep_copy(D_host, D); + + const int n_iterations = 10; + KrylovHandleType handle(N, N_team, n_iterations); + + KokkosBatched::SerialSpmv::template invoke< + typename ValuesViewType::HostMirror, typename IntView::HostMirror, + typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, + 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, + sqr_norm_0_host); + Functor_TestBatchedSerialGMRES( + D, r, c, X, B, Diag, N_team, handle) + .run(); + + Kokkos::fence(); + + Kokkos::deep_copy(R, B); + Kokkos::deep_copy(R_host, R); + Kokkos::deep_copy(X_host, X); + + KokkosBatched::SerialSpmv::template invoke< + typename ValuesViewType::HostMirror, typename IntView::HostMirror, + typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, + 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, + sqr_norm_j_host); + + const MagnitudeType eps = 1.0e5 * ats::epsilon(); + + for (int l = 0; l < N; ++l) + EXPECT_NEAR_KK( + std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); +} +} // namespace GMRES +} // namespace Test + +template +int test_batched_serial_GMRES() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + typedef Kokkos::View IntView; + typedef Kokkos::View + VectorViewType; + + for (int i = 3; i < 10; ++i) { + Test::GMRES::impl_test_batched_GMRES(1024, i, 2); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View + ViewType; + typedef Kokkos::View IntView; + typedef Kokkos::View + VectorViewType; + + for (int i = 3; i < 10; ++i) { + Test::GMRES::impl_test_batched_GMRES(1024, i, 2); + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/sparse/Test_Batched_SerialGMRES_Real.hpp b/unit_test/batched/sparse/Test_Batched_SerialGMRES_Real.hpp new file mode 100644 index 0000000000..acaa2f0ed2 --- /dev/null +++ b/unit_test/batched/sparse/Test_Batched_SerialGMRES_Real.hpp @@ -0,0 +1,12 @@ + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, batched_scalar_serial_GMRES_float) { + test_batched_serial_GMRES(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, batched_scalar_serial_GMRES_double) { + test_batched_serial_GMRES(); +} +#endif diff --git a/unit_test/batched/sparse/Test_Batched_Sparse.hpp b/unit_test/batched/sparse/Test_Batched_Sparse.hpp index 4b36400d2e..36bfc43528 100644 --- a/unit_test/batched/sparse/Test_Batched_Sparse.hpp +++ b/unit_test/batched/sparse/Test_Batched_Sparse.hpp @@ -2,6 +2,8 @@ #define TEST_BATCHED_SPARSE_HPP // Serial kernels +#include "Test_Batched_SerialGMRES.hpp" +#include "Test_Batched_SerialGMRES_Real.hpp" #include "Test_Batched_SerialSpmv.hpp" #include "Test_Batched_SerialSpmv_Real.hpp" diff --git a/unit_test/batched/sparse/Test_Batched_TeamCG.hpp b/unit_test/batched/sparse/Test_Batched_TeamCG.hpp index 3e606d1508..8cfc76410b 100644 --- a/unit_test/batched/sparse/Test_Batched_TeamCG.hpp +++ b/unit_test/batched/sparse/Test_Batched_TeamCG.hpp @@ -14,7 +14,7 @@ namespace Test { namespace TeamCG { template + typename VectorViewType, typename KrylovHandleType> struct Functor_TestBatchedTeamCG { const ValuesViewType _D; const IntView _r; @@ -22,13 +22,18 @@ struct Functor_TestBatchedTeamCG { const VectorViewType _X; const VectorViewType _B; const int _N_team; - KrylovHandle handle; + KrylovHandleType handle; - KOKKOS_INLINE_FUNCTION Functor_TestBatchedTeamCG(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team) - : _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team) {} + : _D(D), + _r(r), + _c(c), + _X(X), + _B(B), + _N_team(N_team), + handle(KrylovHandleType(_D.extent(0), _N_team)) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -50,9 +55,7 @@ struct Functor_TestBatchedTeamCG { Operator A(d, _r, _c); - KokkosBatched::TeamCG::template invoke( - member, A, b, x, handle); + KokkosBatched::TeamCG::invoke(member, A, b, x, handle); } inline void run() { @@ -96,6 +99,13 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { typename Kokkos::Details::ArithTraits::mag_type; using NormViewType = Kokkos::View; + using Norm2DViewType = Kokkos::View; + using Scalar3DViewType = Kokkos::View; + using IntViewType = Kokkos::View; + + using KrylovHandleType = + KrylovHandle; + NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -127,8 +137,8 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); - Functor_TestBatchedTeamCG(D, r, c, X, B, N_team) + Functor_TestBatchedTeamCG(D, r, c, X, B, N_team) .run(); Kokkos::fence(); diff --git a/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp b/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp index f724553590..1cf2cf0866 100644 --- a/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp +++ b/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp @@ -15,21 +15,30 @@ namespace Test { namespace TeamGMRES { template + typename VectorViewType, typename KrylovHandleType> struct Functor_TestBatchedTeamGMRES { const ValuesViewType _D; const IntView _r; const IntView _c; const VectorViewType _X; const VectorViewType _B; + const VectorViewType _Diag; const int _N_team; - KrylovHandle handle; + KrylovHandleType _handle; - KOKKOS_INLINE_FUNCTION Functor_TestBatchedTeamGMRES(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, - const VectorViewType &B, const int N_team) - : _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team) {} + const VectorViewType &B, + const VectorViewType &diag, const int N_team, + KrylovHandleType &handle) + : _D(D), + _r(r), + _c(c), + _X(X), + _B(B), + _N_team(N_team), + _Diag(diag), + _handle(handle) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -42,18 +51,23 @@ struct Functor_TestBatchedTeamGMRES { auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto diag = Kokkos::subview( + _Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - using Operator = KokkosBatched::CrsMatrix; + using Operator = KokkosBatched::CrsMatrix; + using PrecOperator = KokkosBatched::JacobiPrec; Operator A(d, _r, _c); + PrecOperator P(diag); + P.setComputedInverse(); KokkosBatched::TeamGMRES::template invoke( - member, A, b, x, handle); + member, A, b, x, P, _handle); } inline void run() { @@ -63,20 +77,42 @@ struct Functor_TestBatchedTeamGMRES { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO()); + Kokkos::AUTO(), Kokkos::AUTO()); + + const int N = _D.extent(0); + const int n = _X.extent(1); + const int maximum_iteration = _handle.get_max_iteration(); - size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1)); + size_t bytes_0 = ValuesViewType::shmem_size(_N_team, n); size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); - handle.set_max_iteration(10); + _handle.set_ortho_strategy(0); + _handle.set_compute_last_residual(false); + _handle.set_tolerance(1e-8); + + _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType( + "", N, maximum_iteration, n + maximum_iteration + 3); + + using ScalarType = typename ValuesViewType::non_const_value_type; + using Layout = typename ValuesViewType::array_layout; + using EXSP = typename ValuesViewType::execution_space; - int maximum_iteration = handle.get_max_iteration(); + using ViewType2D = Kokkos::View; - policy.set_scratch_size(0, Kokkos::PerTeam(5 * bytes_0 + 5 * bytes_1)); + size_t bytes_1D = ViewType2D::shmem_size(_N_team, 1); + size_t bytes_2D_1 = ViewType2D::shmem_size(_N_team, _X.extent(1)); + size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1); + + size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0)); + size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); + + size_t bytes_int = bytes_row_ptr + bytes_col_idc; + size_t bytes_diag = bytes_2D_1; + size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; policy.set_scratch_size( - 1, Kokkos::PerTeam(maximum_iteration * bytes_0 + - ((maximum_iteration + 3) * maximum_iteration) * - bytes_1)); + 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); + + // policy.set_scratch_size(0, Kokkos::PerTeam(5 * bytes_0 + 5 * bytes_1)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); @@ -95,6 +131,7 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { VectorViewType R("r0", N, BlkSize); VectorViewType B("b", N, BlkSize); ValuesViewType D("D", N, nnz); + ValuesViewType Diag("Diag", N, BlkSize); IntView r("r", BlkSize + 1); IntView c("c", nnz); @@ -106,11 +143,41 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typename Kokkos::Details::ArithTraits::mag_type; using NormViewType = Kokkos::View; + using Norm2DViewType = Kokkos::View; + using Scalar3DViewType = Kokkos::View; + using IntViewType = Kokkos::View; + + using KrylovHandleType = + KrylovHandle; + NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); create_tridiagonal_batched_matrices(nnz, BlkSize, N, r, c, D, X, B); + { + auto diag_values_host = Kokkos::create_mirror_view(Diag); + auto values_host = Kokkos::create_mirror_view(D); + auto row_ptr_host = Kokkos::create_mirror_view(r); + auto colIndices_host = Kokkos::create_mirror_view(c); + + Kokkos::deep_copy(values_host, D); + Kokkos::deep_copy(row_ptr_host, r); + Kokkos::deep_copy(colIndices_host, c); + + int current_index; + for (int i = 0; i < BlkSize; ++i) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); + ++current_index) { + if (colIndices_host(current_index) == i) break; + } + for (int j = 0; j < N; ++j) + diag_values_host(j, i) = values_host(j, current_index); + } + + Kokkos::deep_copy(Diag, diag_values_host); + } + // Compute initial norm Kokkos::deep_copy(R, B); @@ -131,6 +198,9 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(r_host, r); Kokkos::deep_copy(D_host, D); + const int n_iterations = 10; + KrylovHandleType handle(N, N_team, n_iterations); + KokkosBatched::SerialSpmv::template invoke< typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, @@ -138,7 +208,8 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); Functor_TestBatchedTeamGMRES(D, r, c, X, B, N_team) + VectorViewType, KrylovHandleType>( + D, r, c, X, B, Diag, N_team, handle) .run(); Kokkos::fence(); diff --git a/unit_test/batched/sparse/Test_Batched_TeamVectorCG.hpp b/unit_test/batched/sparse/Test_Batched_TeamVectorCG.hpp index 6637d9858d..d9fb350726 100644 --- a/unit_test/batched/sparse/Test_Batched_TeamVectorCG.hpp +++ b/unit_test/batched/sparse/Test_Batched_TeamVectorCG.hpp @@ -14,7 +14,7 @@ namespace Test { namespace TeamVectorCG { template + typename VectorViewType, typename KrylovHandleType> struct Functor_TestBatchedTeamVectorCG { const ValuesViewType _D; const IntView _r; @@ -22,13 +22,18 @@ struct Functor_TestBatchedTeamVectorCG { const VectorViewType _X; const VectorViewType _B; const int _N_team; - KrylovHandle handle; + KrylovHandleType handle; - KOKKOS_INLINE_FUNCTION Functor_TestBatchedTeamVectorCG(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team) - : _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team) {} + : _D(D), + _r(r), + _c(c), + _X(X), + _B(B), + _N_team(N_team), + handle(KrylovHandleType(_D.extent(0), _N_team)) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -96,6 +101,13 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { typename Kokkos::Details::ArithTraits::mag_type; using NormViewType = Kokkos::View; + using Norm2DViewType = Kokkos::View; + using Scalar3DViewType = Kokkos::View; + using IntViewType = Kokkos::View; + + using KrylovHandleType = + KrylovHandle; + NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -128,7 +140,8 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); Functor_TestBatchedTeamVectorCG(D, r, c, X, B, N_team) + VectorViewType, KrylovHandleType>(D, r, c, X, + B, N_team) .run(); Kokkos::fence(); diff --git a/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp b/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp index 87e9da0281..764edc9feb 100644 --- a/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp +++ b/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp @@ -15,7 +15,7 @@ namespace Test { namespace TeamVectorGMRES { template + typename VectorViewType, typename KrylovHandleType> struct Functor_TestBatchedTeamVectorGMRES { const ValuesViewType _D; const IntView _r; @@ -24,15 +24,21 @@ struct Functor_TestBatchedTeamVectorGMRES { const VectorViewType _B; const VectorViewType _Diag; const int _N_team; - KrylovHandle handle; + KrylovHandleType _handle; - KOKKOS_INLINE_FUNCTION Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const VectorViewType &diag, - const int N_team) - : _D(D), _r(r), _c(c), _X(X), _B(B), _Diag(diag), _N_team(N_team) {} + const int N_team, KrylovHandleType &handle) + : _D(D), + _r(r), + _c(c), + _X(X), + _B(B), + _N_team(N_team), + _Diag(diag), + _handle(handle) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -57,10 +63,11 @@ struct Functor_TestBatchedTeamVectorGMRES { Operator A(d, _r, _c); PrecOperator P(diag); + P.setComputedInverse(); KokkosBatched::TeamVectorGMRES::template invoke( - member, A, b, x, P, handle); + member, A, b, x, P, _handle); } inline void run() { @@ -72,18 +79,40 @@ struct Functor_TestBatchedTeamVectorGMRES { Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); - size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1)); + const int N = _D.extent(0); + const int n = _X.extent(1); + const int maximum_iteration = _handle.get_max_iteration(); + + size_t bytes_0 = ValuesViewType::shmem_size(_N_team, n); size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); - handle.set_max_iteration(10); + _handle.set_ortho_strategy(0); + _handle.set_compute_last_residual(false); + _handle.set_tolerance(1e-8); + + _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType( + "", N, maximum_iteration, n + maximum_iteration + 3); + + using ScalarType = typename ValuesViewType::non_const_value_type; + using Layout = typename ValuesViewType::array_layout; + using EXSP = typename ValuesViewType::execution_space; + + using ViewType2D = Kokkos::View; - int maximum_iteration = handle.get_max_iteration(); + size_t bytes_1D = ViewType2D::shmem_size(_N_team, 1); + size_t bytes_2D_1 = ViewType2D::shmem_size(_N_team, _X.extent(1)); + size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1); - policy.set_scratch_size(0, Kokkos::PerTeam(5 * bytes_0 + 5 * bytes_1)); + size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0)); + size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); + + size_t bytes_int = bytes_row_ptr + bytes_col_idc; + size_t bytes_diag = bytes_2D_1; + size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; policy.set_scratch_size( - 1, Kokkos::PerTeam(maximum_iteration * bytes_0 + - ((maximum_iteration + 3) * maximum_iteration) * - bytes_1)); + 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); + + // policy.set_scratch_size(0, Kokkos::PerTeam(5 * bytes_0 + 5 * bytes_1)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); @@ -114,6 +143,13 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typename Kokkos::Details::ArithTraits::mag_type; using NormViewType = Kokkos::View; + using Norm2DViewType = Kokkos::View; + using Scalar3DViewType = Kokkos::View; + using IntViewType = Kokkos::View; + + using KrylovHandleType = + KrylovHandle; + NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -162,6 +198,9 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(r_host, r); Kokkos::deep_copy(D_host, D); + const int n_iterations = 10; + KrylovHandleType handle(N, N_team, n_iterations); + KokkosBatched::SerialSpmv::template invoke< typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, @@ -169,8 +208,8 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); Functor_TestBatchedTeamVectorGMRES(D, r, c, X, B, Diag, - N_team) + VectorViewType, KrylovHandleType>( + D, r, c, X, B, Diag, N_team, handle) .run(); Kokkos::fence(); From 7504347637fe1f3c1627734178fb65b0f1ba48ff Mon Sep 17 00:00:00 2001 From: Kim Liegeois Date: Tue, 26 Apr 2022 07:45:15 -0600 Subject: [PATCH 112/261] Treat warnings as errors --- example/batched_solve/team_GMRES.cpp | 7 +------ unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp | 2 +- unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp | 7 +------ unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp | 7 +------ 4 files changed, 4 insertions(+), 19 deletions(-) diff --git a/example/batched_solve/team_GMRES.cpp b/example/batched_solve/team_GMRES.cpp index b94ad00709..188fcc54f3 100644 --- a/example/batched_solve/team_GMRES.cpp +++ b/example/batched_solve/team_GMRES.cpp @@ -232,11 +232,6 @@ struct Functor_TestBatchedTeamVectorGMRES { size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); size_t bytes_2D_1 = ViewType2D::shmem_size(_N_team, _X.extent(1)); size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1); - size_t bytes_3D_1 = - ViewType3D::shmem_size(_N_team, _X.extent(1), maximum_iteration); - size_t bytes_3D_2 = ViewType3D::shmem_size(_N_team, maximum_iteration + 1, - maximum_iteration); - size_t bytes_3D_3 = ViewType3D::shmem_size(_N_team, 2, maximum_iteration); size_t bytes_int = bytes_row_ptr + bytes_col_idc; size_t bytes_diag = bytes_2D_1; @@ -267,7 +262,7 @@ int main(int /*argc*/, char ** /*argv*/) { std::string name_A = "mat.mm"; std::string name_B = "rhs.mm"; - int N, Blk, nnz, ncols; + int N, Blk, nnz; Blk = 10; N = 100; diff --git a/unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp b/unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp index 15af38bef5..108a984a9d 100644 --- a/unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp +++ b/unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp @@ -36,8 +36,8 @@ struct Functor_TestBatchedSerialGMRES { _c(c), _X(X), _B(B), - _N_team(N_team), _Diag(diag), + _N_team(N_team), _handle(handle) {} KOKKOS_INLINE_FUNCTION void operator()(const int k) const { diff --git a/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp b/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp index 1cf2cf0866..553d4d3419 100644 --- a/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp +++ b/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp @@ -36,8 +36,8 @@ struct Functor_TestBatchedTeamGMRES { _c(c), _X(X), _B(B), - _N_team(N_team), _Diag(diag), + _N_team(N_team), _handle(handle) {} template @@ -83,9 +83,6 @@ struct Functor_TestBatchedTeamGMRES { const int n = _X.extent(1); const int maximum_iteration = _handle.get_max_iteration(); - size_t bytes_0 = ValuesViewType::shmem_size(_N_team, n); - size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); - _handle.set_ortho_strategy(0); _handle.set_compute_last_residual(false); _handle.set_tolerance(1e-8); @@ -112,8 +109,6 @@ struct Functor_TestBatchedTeamGMRES { policy.set_scratch_size( 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); - // policy.set_scratch_size(0, Kokkos::PerTeam(5 * bytes_0 + 5 * bytes_1)); - Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } diff --git a/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp b/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp index 764edc9feb..17f72c8963 100644 --- a/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp +++ b/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp @@ -36,8 +36,8 @@ struct Functor_TestBatchedTeamVectorGMRES { _c(c), _X(X), _B(B), - _N_team(N_team), _Diag(diag), + _N_team(N_team), _handle(handle) {} template @@ -83,9 +83,6 @@ struct Functor_TestBatchedTeamVectorGMRES { const int n = _X.extent(1); const int maximum_iteration = _handle.get_max_iteration(); - size_t bytes_0 = ValuesViewType::shmem_size(_N_team, n); - size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); - _handle.set_ortho_strategy(0); _handle.set_compute_last_residual(false); _handle.set_tolerance(1e-8); @@ -112,8 +109,6 @@ struct Functor_TestBatchedTeamVectorGMRES { policy.set_scratch_size( 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); - // policy.set_scratch_size(0, Kokkos::PerTeam(5 * bytes_0 + 5 * bytes_1)); - Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); } From 4fb43df87ff7045f79cfc2841fb518f05136a441 Mon Sep 17 00:00:00 2001 From: Kim Liegeois Date: Tue, 26 Apr 2022 10:50:32 -0600 Subject: [PATCH 113/261] Remove other warnings --- example/batched_solve/team_GMRES.cpp | 7 ------- .../sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp | 7 ------- src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp | 7 ------- 3 files changed, 21 deletions(-) diff --git a/example/batched_solve/team_GMRES.cpp b/example/batched_solve/team_GMRES.cpp index 188fcc54f3..a034907091 100644 --- a/example/batched_solve/team_GMRES.cpp +++ b/example/batched_solve/team_GMRES.cpp @@ -192,7 +192,6 @@ struct Functor_TestBatchedTeamVectorGMRES { } inline double run() { - typedef typename ValuesViewType::value_type value_type; std::string name("KokkosBatched::Test::TeamVectorGMRES"); Kokkos::Timer timer; Kokkos::Profiling::pushRegion(name.c_str()); @@ -220,12 +219,7 @@ struct Functor_TestBatchedTeamVectorGMRES { using Layout = typename ValuesViewType::array_layout; using EXSP = typename ValuesViewType::execution_space; - using MagnitudeType = - typename Kokkos::Details::ArithTraits::mag_type; - - using ViewType1D = Kokkos::View; using ViewType2D = Kokkos::View; - using ViewType3D = Kokkos::View; size_t bytes_1D = ViewType2D::shmem_size(_N_team, 1); size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0)); @@ -294,7 +288,6 @@ int main(int /*argc*/, char ** /*argv*/) { using MagnitudeType = typename Kokkos::Details::ArithTraits::mag_type; - using NormViewType = Kokkos::View; using Norm2DViewType = Kokkos::View; using Scalar3DViewType = Kokkos::View; diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp index a95b712cbb..dfc9d96518 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp @@ -79,17 +79,10 @@ struct TeamVectorGMRES { typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; typedef Kokkos::Details::ArithTraits ATM; - using ScratchPadNormViewType = Kokkos::View< - MagnitudeType*, - typename VectorViewType::execution_space::scratch_memory_space>; using ScratchPadVectorViewType = Kokkos::View< typename VectorViewType::non_const_value_type**, typename VectorViewType::array_layout, typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadMultiVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type***, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; using TeamVectorCopy1D = TeamVectorCopy; const OrdinalType numMatrices = _X.extent(0); diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp index 58d136e69c..fdbde3d278 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp @@ -78,17 +78,10 @@ struct TeamGMRES { typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; typedef Kokkos::Details::ArithTraits ATM; - using ScratchPadNormViewType = Kokkos::View< - MagnitudeType*, - typename VectorViewType::execution_space::scratch_memory_space>; using ScratchPadVectorViewType = Kokkos::View< typename VectorViewType::non_const_value_type**, typename VectorViewType::array_layout, typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadMultiVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type***, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; using TeamCopy1D = TeamCopy; const OrdinalType numMatrices = _X.extent(0); From b66bd0b200b1c7cb16a62744137056abd6b479dd Mon Sep 17 00:00:00 2001 From: kliegeois Date: Wed, 27 Apr 2022 08:47:32 -0600 Subject: [PATCH 114/261] Update PR with Luc's comments --- example/batched_solve/team_GMRES.cpp | 86 +-- .../impl/KokkosBatched_Gemv_Team_Internal.hpp | 1 + .../sparse/KokkosBatched_CrsMatrix.hpp | 4 +- src/batched/sparse/KokkosBatched_GMRES.hpp | 1 + .../sparse/KokkosBatched_Krylov_Handle.hpp | 207 ++++--- .../sparse/KokkosBatched_Krylov_Solvers.hpp | 129 ++++ .../impl/KokkosBatched_CG_TeamVector_Impl.hpp | 256 ++++---- .../impl/KokkosBatched_CG_Team_Impl.hpp | 256 ++++---- .../impl/KokkosBatched_GMRES_Serial_Impl.hpp | 459 +++++++------- .../KokkosBatched_GMRES_TeamVector_Impl.hpp | 582 +++++++++--------- .../impl/KokkosBatched_GMRES_Team_Impl.hpp | 577 +++++++++-------- 11 files changed, 1339 insertions(+), 1219 deletions(-) create mode 100644 src/batched/sparse/KokkosBatched_Krylov_Solvers.hpp diff --git a/example/batched_solve/team_GMRES.cpp b/example/batched_solve/team_GMRES.cpp index a034907091..404e573491 100644 --- a/example/batched_solve/team_GMRES.cpp +++ b/example/batched_solve/team_GMRES.cpp @@ -69,76 +69,55 @@ typedef Kokkos::DefaultExecutionSpace exec_space; template struct Functor_TestBatchedTeamVectorGMRES { - const ValuesViewType _D; + const ValuesViewType _values; const ValuesViewType _diag; const IntView _r; const IntView _c; const VectorViewType _X; const VectorViewType _B; - const int _N_team, _team_size, _vector_length; - const int _N_iteration; - const double _tol; - const int _ortho_strategy; - const int _scratch_pad_level; + const int _team_size, _vector_length; KrylovHandleType _handle; KOKKOS_INLINE_FUNCTION Functor_TestBatchedTeamVectorGMRES( - const ValuesViewType &D, const IntView &r, const IntView &c, - const VectorViewType &X, const VectorViewType &B, const int N_team, - const int team_size, const int vector_length, const int N_iteration, - const double tol, const int ortho_strategy, const int scratch_pad_level, - KrylovHandleType &handle) - : _D(D), + const ValuesViewType &values, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int team_size, + const int vector_length, KrylovHandleType &handle) + : _values(values), _r(r), _c(c), _X(X), _B(B), - _N_team(N_team), _team_size(team_size), _vector_length(vector_length), - _N_iteration(N_iteration), - _tol(tol), - _ortho_strategy(ortho_strategy), - _scratch_pad_level(scratch_pad_level), _handle(handle) {} KOKKOS_INLINE_FUNCTION Functor_TestBatchedTeamVectorGMRES( - const ValuesViewType &D, const ValuesViewType &diag, const IntView &r, - const IntView &c, const VectorViewType &X, const VectorViewType &B, - const int N_team, const int team_size, const int vector_length, - const int N_iteration, const double tol, int ortho_strategy, - const int scratch_pad_level, KrylovHandleType &handle) - : _D(D), + const ValuesViewType &values, const ValuesViewType &diag, + const IntView &r, const IntView &c, const VectorViewType &X, + const VectorViewType &B, const int team_size, const int vector_length, + KrylovHandleType &handle) + : _values(values), _diag(diag), _r(r), _c(c), _X(X), _B(B), - _N_team(N_team), _team_size(team_size), _vector_length(vector_length), - _N_iteration(N_iteration), - _tol(tol), - _ortho_strategy(ortho_strategy), - _scratch_pad_level(scratch_pad_level), _handle(handle) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { - const int first_matrix = static_cast(member.league_rank()) * _N_team; - const int N = _D.extent(0); - const int last_matrix = - (static_cast(member.league_rank() + 1) * _N_team < N - ? static_cast(member.league_rank() + 1) * _N_team - : N); + const int first_matrix = _handle.first_index(member.league_rank()); + const int last_matrix = _handle.last_index(member.league_rank()); using TeamVectorCopy1D = KokkosBatched::TeamVectorCopy; - auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL); + auto d = Kokkos::subview( + _values, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), @@ -196,10 +175,10 @@ struct Functor_TestBatchedTeamVectorGMRES { Kokkos::Timer timer; Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::TeamPolicy auto_policy( - ceil(1. * _D.extent(0) / _N_team), Kokkos::AUTO(), Kokkos::AUTO()); - Kokkos::TeamPolicy tuned_policy( - ceil(1. * _D.extent(0) / _N_team), _team_size, _vector_length); + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), + Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), + _team_size, _vector_length); Kokkos::TeamPolicy policy; if (_team_size < 1) @@ -207,12 +186,6 @@ struct Functor_TestBatchedTeamVectorGMRES { else policy = tuned_policy; - _handle.set_max_iteration(_N_iteration); - _handle.set_tolerance(_tol); - _handle.set_ortho_strategy(_ortho_strategy); - _handle.set_scratch_pad_level(_scratch_pad_level); - _handle.set_compute_last_residual(true); - int maximum_iteration = _handle.get_max_iteration(); using ScalarType = typename ValuesViewType::non_const_value_type; @@ -221,11 +194,14 @@ struct Functor_TestBatchedTeamVectorGMRES { using ViewType2D = Kokkos::View; - size_t bytes_1D = ViewType2D::shmem_size(_N_team, 1); + size_t bytes_1D = + ViewType2D::shmem_size(_handle.get_number_of_systems_per_team(), 1); size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0)); size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); - size_t bytes_2D_1 = ViewType2D::shmem_size(_N_team, _X.extent(1)); - size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1); + size_t bytes_2D_1 = ViewType2D::shmem_size( + _handle.get_number_of_systems_per_team(), _X.extent(1)); + size_t bytes_2D_2 = ViewType2D::shmem_size( + _handle.get_number_of_systems_per_team(), maximum_iteration + 1); size_t bytes_int = bytes_row_ptr + bytes_col_idc; size_t bytes_diag = bytes_2D_1; @@ -309,12 +285,18 @@ int main(int /*argc*/, char ** /*argv*/) { handle.Arnoldi_view = Scalar3DViewType("", N, n_iterations, Blk + n_iterations + 3); + handle.set_max_iteration(n_iterations); + handle.set_tolerance(tol); + handle.set_ortho_strategy(ortho_strategy); + handle.set_scratch_pad_level(0); + handle.set_compute_last_residual(true); + double time = Functor_TestBatchedTeamVectorGMRES( - values, diag, rowOffsets, colIndices, x, y, N_team, team_size, - vector_length, n_iterations, tol, ortho_strategy, 0, handle) + true>(values, diag, rowOffsets, + colIndices, x, y, team_size, + vector_length, handle) .run(); printf("times = %f secondes\n", time); diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp index 8315a59ce6..cc3f6d27ff 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp @@ -24,6 +24,7 @@ struct TeamGemvInternal { const int as1, const ValueType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT y, const int ys0); + template KOKKOS_INLINE_FUNCTION static int invoke( diff --git a/src/batched/sparse/KokkosBatched_CrsMatrix.hpp b/src/batched/sparse/KokkosBatched_CrsMatrix.hpp index 1d3edcd343..d7fd94744f 100644 --- a/src/batched/sparse/KokkosBatched_CrsMatrix.hpp +++ b/src/batched/sparse/KokkosBatched_CrsMatrix.hpp @@ -111,7 +111,7 @@ class CrsMatrix { MagnitudeType alpha = Kokkos::Details::ArithTraits::one(), MagnitudeType beta = Kokkos::Details::ArithTraits::zero()) const { - if (beta == 0) + if (beta == Kokkos::Details::ArithTraits::zero()) KokkosBatched::TeamVectorSpmv::template invoke< ValuesViewType, IntViewType, XViewType, YViewType, 0>( member, alpha, values, row_ptr, colIndices, X, beta, Y); @@ -127,7 +127,7 @@ class CrsMatrix { MagnitudeType alpha = Kokkos::Details::ArithTraits::one(), MagnitudeType beta = Kokkos::Details::ArithTraits::zero()) const { - if (beta == 0) + if (beta == Kokkos::Details::ArithTraits::zero()) KokkosBatched::SerialSpmv::template invoke< ValuesViewType, IntViewType, XViewType, YViewType, 0>( alpha, values, row_ptr, colIndices, X, beta, Y); diff --git a/src/batched/sparse/KokkosBatched_GMRES.hpp b/src/batched/sparse/KokkosBatched_GMRES.hpp index 5a7a8a7749..51efc24aed 100644 --- a/src/batched/sparse/KokkosBatched_GMRES.hpp +++ b/src/batched/sparse/KokkosBatched_GMRES.hpp @@ -60,6 +60,7 @@ /// \param handle [in]: a handle which provides different information such as /// the tolerance or the maximal number of iterations of the solver. +#include #include "KokkosBatched_Krylov_Handle.hpp" #include "KokkosBatched_GMRES_Serial_Impl.hpp" #include "KokkosBatched_GMRES_Team_Impl.hpp" diff --git a/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp b/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp index 1faabcc993..3467a6f910 100644 --- a/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp +++ b/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp @@ -42,19 +42,36 @@ //@HEADER */ -#include -#include -#include - #ifndef __KOKKOSBATCHED_KRYLOV_HANDLE_HPP__ #define __KOKKOSBATCHED_KRYLOV_HANDLE_HPP__ -//#define VERBOSE + +#include +#include namespace KokkosBatched { /// \brief KrylovHandle /// -/// \tparam scalar_type: Scalar type of the linear solver +/// The handle is used to pass information between the Krylov solver and the +/// calling code. +/// +/// The handle has some views as data member, their required size can be +/// different depending on the used Krylov solver. +/// +/// In the case of the Batched GMRES, the size should be as follows: +/// - Arnoldi_view a batched_size x max_iteration x (n_rows + max_iteration + +/// 3); +/// - tmp_view is NOT used for the team/teamvector GMRES; +/// it is used for the serial GMRES and the size is batched_size x (n_rows + +/// max_iteration + 3); +/// - residual_norms is an optional batched_size x (max_iteration + 2) used to +/// store the convergence history; +/// - iteration_numbers is a 1D view of length batched_size; +/// - first_index and last_index are 1D of length n_teams. +/// +/// \tparam NormViewType: type of the view used to store the convergence history +/// \tparam IntViewType: type of the view used to store the number of iteration +/// per system \tparam ViewType3D: type of the 3D temporary views template class KrylovHandle { @@ -82,7 +99,8 @@ class KrylovHandle { norm_type max_tolerance; int max_iteration; int batched_size; - int N_team; + const int N_team; + int n_teams; int ortho_strategy; int scratch_pad_level; bool compute_last_residual; @@ -105,7 +123,7 @@ class KrylovHandle { iteration_numbers = IntViewType("", batched_size); Kokkos::deep_copy(iteration_numbers, -1); - int n_teams = ceil(1. * batched_size / N_team); + n_teams = ceil(1. * batched_size / N_team); first_index = IntViewType("", n_teams); last_index = IntViewType("", n_teams); @@ -130,6 +148,12 @@ class KrylovHandle { host_synchronised = false; } + /// \brief get_number_of_systems_per_team + int get_number_of_systems_per_team() { return N_team; } + + /// \brief get_number_of_teams + int get_number_of_teams() { return n_teams; } + /// \brief reset /// Reset the iteration numbers to the default value of -1 /// and the residual norms if monitored. @@ -144,6 +168,8 @@ class KrylovHandle { host_synchronised = false; } + /// \brief synchronise_host + /// Synchronise host and device. /// void synchronise_host() { @@ -250,33 +276,6 @@ class KrylovHandle { KOKKOS_INLINE_FUNCTION int get_max_iteration() const { return max_iteration; } - /// \brief set_norm - /// Store the norm of one of the system at one of the iteration - /// - /// \param batched_id [in]: Global batched ID - /// \param iteration_id [in]: Iteration ID - /// \param norm_i [in]: Norm to store - - KOKKOS_INLINE_FUNCTION - void set_norm(int batched_id, int iteration_id, norm_type norm_i) const { - if (monitor_residual) residual_norms(batched_id, iteration_id) = norm_i; - } - - /// \brief set_norm - /// Store the norm of one of the system at one of the iteration - /// - /// \param batchedteam_id [in]: Team ID - /// \param batched_id [in]: Local batched ID (local ID within the team) - /// \param iteration_id [in]: Iteration ID - /// \param norm_i [in]: Norm to store - - KOKKOS_INLINE_FUNCTION - void set_norm(int team_id, int batched_id, int iteration_id, - norm_type norm_i) const { - if (monitor_residual) - residual_norms(team_id * N_team + batched_id, iteration_id) = norm_i; - } - /// \brief get_norm /// Get the norm of one system at a given iteration /// @@ -305,32 +304,6 @@ class KrylovHandle { return 0; } - /// \brief set_last_norm - /// Store the last norm of one system - /// - /// \param batched_id [in]: Global batched ID - /// \param norm_i [in]: Norm to store - - KOKKOS_INLINE_FUNCTION - void set_last_norm(int batched_id, norm_type norm_i) const { - if (monitor_residual) - residual_norms(batched_id, max_iteration + 1) = norm_i; - } - - /// \brief set_last_norm - /// Store the last norm of one system - /// - /// \param batchedteam_id [in]: Team ID - /// \param batched_id [in]: Local batched ID (local ID within the team) - /// \param batched_id [in]: Global batched ID - /// \param norm_i [in]: Norm to store - - KOKKOS_INLINE_FUNCTION - void set_last_norm(int team_id, int batched_id, norm_type norm_i) const { - if (monitor_residual) - residual_norms(team_id * N_team + batched_id, max_iteration + 1) = norm_i; - } - /// \brief get_last_norm /// Get the last norm of one system /// @@ -357,29 +330,6 @@ class KrylovHandle { return 0; } - /// \brief set_iteration - /// Store the number of iteration after convergence for one system - /// - /// \param batched_id [in]: Global batched ID - /// \param iteration_id [in]: Iteration ID - - KOKKOS_INLINE_FUNCTION - void set_iteration(int batched_id, int iteration_id) const { - iteration_numbers(batched_id) = iteration_id; - } - - /// \brief set_iteration - /// Store the number of iteration after convergence for one system - /// - /// \param batchedteam_id [in]: Team ID - /// \param batched_id [in]: Local batched ID (local ID within the team) - /// \param iteration_id [in]: Iteration ID - - KOKKOS_INLINE_FUNCTION - void set_iteration(int team_id, int batched_id, int iteration_id) const { - iteration_numbers(team_id * N_team + batched_id) = iteration_id; - } - /// \brief get_iteration /// Get the number of iteration after convergence for one system /// @@ -460,6 +410,95 @@ class KrylovHandle { else return false; } + + private: + /// \brief set_norm + /// Store the norm of one of the system at one of the iteration + /// + /// \param batched_id [in]: Global batched ID + /// \param iteration_id [in]: Iteration ID + /// \param norm_i [in]: Norm to store + + KOKKOS_INLINE_FUNCTION + void set_norm(int batched_id, int iteration_id, norm_type norm_i) const { + if (monitor_residual) residual_norms(batched_id, iteration_id) = norm_i; + } + + /// \brief set_norm + /// Store the norm of one of the system at one of the iteration + /// + /// \param batchedteam_id [in]: Team ID + /// \param batched_id [in]: Local batched ID (local ID within the team) + /// \param iteration_id [in]: Iteration ID + /// \param norm_i [in]: Norm to store + + KOKKOS_INLINE_FUNCTION + void set_norm(int team_id, int batched_id, int iteration_id, + norm_type norm_i) const { + if (monitor_residual) + residual_norms(team_id * N_team + batched_id, iteration_id) = norm_i; + } + + /// \brief set_last_norm + /// Store the last norm of one system + /// + /// \param batched_id [in]: Global batched ID + /// \param norm_i [in]: Norm to store + + KOKKOS_INLINE_FUNCTION + void set_last_norm(int batched_id, norm_type norm_i) const { + if (monitor_residual) + residual_norms(batched_id, max_iteration + 1) = norm_i; + } + + /// \brief set_last_norm + /// Store the last norm of one system + /// + /// \param batchedteam_id [in]: Team ID + /// \param batched_id [in]: Local batched ID (local ID within the team) + /// \param batched_id [in]: Global batched ID + /// \param norm_i [in]: Norm to store + + KOKKOS_INLINE_FUNCTION + void set_last_norm(int team_id, int batched_id, norm_type norm_i) const { + if (monitor_residual) + residual_norms(team_id * N_team + batched_id, max_iteration + 1) = norm_i; + } + + /// \brief set_iteration + /// Store the number of iteration after convergence for one system + /// + /// \param batched_id [in]: Global batched ID + /// \param iteration_id [in]: Iteration ID + + KOKKOS_INLINE_FUNCTION + void set_iteration(int batched_id, int iteration_id) const { + iteration_numbers(batched_id) = iteration_id; + } + + /// \brief set_iteration + /// Store the number of iteration after convergence for one system + /// + /// \param batchedteam_id [in]: Team ID + /// \param batched_id [in]: Local batched ID (local ID within the team) + /// \param iteration_id [in]: Iteration ID + + KOKKOS_INLINE_FUNCTION + void set_iteration(int team_id, int batched_id, int iteration_id) const { + iteration_numbers(team_id * N_team + batched_id) = iteration_id; + } + + public: + friend struct SerialGMRES; + template + friend struct TeamGMRES; + template + friend struct TeamVectorGMRES; + + template + friend struct TeamCG; + template + friend struct TeamVectorCG; }; } // namespace KokkosBatched diff --git a/src/batched/sparse/KokkosBatched_Krylov_Solvers.hpp b/src/batched/sparse/KokkosBatched_Krylov_Solvers.hpp new file mode 100644 index 0000000000..413c72678f --- /dev/null +++ b/src/batched/sparse/KokkosBatched_Krylov_Solvers.hpp @@ -0,0 +1,129 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef __KOKKOSBATCHED_KRYLOV_SOLVERS_HPP__ +#define __KOKKOSBATCHED_KRYLOV_SOLVERS_HPP__ + +namespace KokkosBatched { + +struct SerialGMRES { + template + KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle, + const int GMRES_id); + template + KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle); +}; + +template +struct TeamGMRES { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle); +}; + +template +struct TeamVectorGMRES { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle); +}; + +template +struct TeamCG { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle); +}; + +template +struct TeamVectorCG { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle); +}; + +} // namespace KokkosBatched + +#endif diff --git a/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp index f32c02417c..11dc805a0c 100644 --- a/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp @@ -61,149 +61,145 @@ namespace KokkosBatched { /// template -struct TeamVectorCG { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandleType& handle) { - typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - - const size_t maximum_iteration = handle.get_max_iteration(); - const MagnitudeType tolerance = handle.get_tolerance(); - - using ScratchPadNormViewType = Kokkos::View< - MagnitudeType*, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using TeamVectorCopy1D = TeamVectorCopy; - - const OrdinalType numMatrices = _X.extent(0); - const OrdinalType numRows = _X.extent(1); - - ScratchPadVectorViewType P( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - ScratchPadVectorViewType Q( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - ScratchPadVectorViewType R( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - ScratchPadVectorViewType X( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - - ScratchPadNormViewType sqr_norm_0( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType sqr_norm_j( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType alpha( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType mask( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType tmp( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - - TeamVectorCopy::invoke(member, _X, X); - // Deep copy of b into r_0: - TeamVectorCopy::invoke(member, _B, R); - - // r_0 := b - A x_0 +template +KOKKOS_INLINE_FUNCTION int TeamVectorCG::template invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle) { + typedef int OrdinalType; + typedef typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + + const size_t maximum_iteration = handle.get_max_iteration(); + const MagnitudeType tolerance = handle.get_tolerance(); + + using ScratchPadNormViewType = Kokkos::View< + MagnitudeType*, + typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + using TeamVectorCopy1D = TeamVectorCopy; + + const OrdinalType numMatrices = _X.extent(0); + const OrdinalType numRows = _X.extent(1); + + ScratchPadVectorViewType P( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType Q( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType R( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType X( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + + ScratchPadNormViewType sqr_norm_0( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType sqr_norm_j( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType alpha( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType mask( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType tmp( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + + TeamVectorCopy::invoke(member, _X, X); + // Deep copy of b into r_0: + TeamVectorCopy::invoke(member, _B, R); + + // r_0 := b - A x_0 + member.team_barrier(); + A.template apply(member, X, R, -1, 1); + member.team_barrier(); + + // Deep copy of r_0 into p_0: + TeamVectorCopy::invoke(member, R, P); + + TeamVectorDot::invoke(member, R, R, sqr_norm_0); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + mask(i) = + sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; + }); + + TeamVectorCopy1D::invoke(member, sqr_norm_0, sqr_norm_j); + + int status = 1; + int number_not_converged = 0; + + for (size_t j = 0; j < maximum_iteration; ++j) { + // q := A p_j + A.template apply(member, P, Q); member.team_barrier(); - A.template apply(member, X, R, -1, 1); + + TeamVectorDot::invoke(member, P, Q, tmp); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + alpha(i) = + mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; + }); + member.team_barrier(); + + // x_{j+1} := alpha p_j + x_j + TeamVectorAxpy::invoke(member, alpha, P, X); + member.team_barrier(); + + // r_{j+1} := - alpha q + r_j + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { alpha(i) = -alpha(i); }); member.team_barrier(); - // Deep copy of r_0 into p_0: - TeamVectorCopy::invoke(member, R, P); + TeamVectorAxpy::invoke(member, alpha, Q, R); + member.team_barrier(); - TeamVectorDot::invoke(member, R, R, sqr_norm_0); + TeamVectorDot::invoke(member, R, R, tmp); member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& i) { - mask(i) = - sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; + alpha(i) = + mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; }); - TeamVectorCopy1D::invoke(member, sqr_norm_0, sqr_norm_j); - - int status = 1; - int number_not_converged = 0; - - for (size_t j = 0; j < maximum_iteration; ++j) { - // q := A p_j - A.template apply(member, P, Q); - member.team_barrier(); - - TeamVectorDot::invoke(member, P, Q, tmp); - member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; - }); - member.team_barrier(); - - // x_{j+1} := alpha p_j + x_j - TeamVectorAxpy::invoke(member, alpha, P, X); - member.team_barrier(); - - // r_{j+1} := - alpha q + r_j - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { alpha(i) = -alpha(i); }); - member.team_barrier(); - - TeamVectorAxpy::invoke(member, alpha, Q, R); - member.team_barrier(); - - TeamVectorDot::invoke(member, R, R, tmp); - member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; - }); - - TeamVectorCopy1D::invoke(member, tmp, sqr_norm_j); - - // Relative convergence check: - number_not_converged = 0; - Kokkos::parallel_reduce( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i, int& lnumber_not_converged) { - if (sqr_norm_j(i) / sqr_norm_0(i) > tolerance * tolerance) - ++lnumber_not_converged; - else - mask(i) = 0.; - }, - number_not_converged); - - member.team_barrier(); - - if (number_not_converged == 0) { - status = 0; - break; - } - - // p_{j+1} := alpha p_j + r_{j+1} - TeamVectorXpay::invoke(member, alpha, R, P); - member.team_barrier(); + TeamVectorCopy1D::invoke(member, tmp, sqr_norm_j); + + // Relative convergence check: + number_not_converged = 0; + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i, int& lnumber_not_converged) { + if (sqr_norm_j(i) / sqr_norm_0(i) > tolerance * tolerance) + ++lnumber_not_converged; + else + mask(i) = 0.; + }, + number_not_converged); + + member.team_barrier(); + + if (number_not_converged == 0) { + status = 0; + break; } - TeamVectorCopy::invoke(member, X, _X); - return status; + // p_{j+1} := alpha p_j + r_{j+1} + TeamVectorXpay::invoke(member, alpha, R, P); + member.team_barrier(); } -}; + + TeamVectorCopy::invoke(member, X, _X); + return status; +} } // namespace KokkosBatched #endif diff --git a/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp index 02328aaf1a..606ad8d714 100644 --- a/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp @@ -60,149 +60,145 @@ namespace KokkosBatched { /// template -struct TeamCG { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandle& handle) { - typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - - size_t maximum_iteration = handle.get_max_iteration(); - const MagnitudeType tolerance = handle.get_tolerance(); - - using ScratchPadNormViewType = Kokkos::View< - MagnitudeType*, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using TeamCopy1D = TeamCopy; - - const OrdinalType numMatrices = _X.extent(0); - const OrdinalType numRows = _X.extent(1); - - ScratchPadVectorViewType P( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - ScratchPadVectorViewType Q( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - ScratchPadVectorViewType R( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - ScratchPadVectorViewType X( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - numRows); - - ScratchPadNormViewType sqr_norm_0( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType sqr_norm_j( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType alpha( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType mask( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - ScratchPadNormViewType tmp( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices); - - TeamCopy::invoke(member, _X, X); - // Deep copy of b into r_0: - TeamCopy::invoke(member, _B, R); - - // r_0 := b - A x_0 +template +KOKKOS_INLINE_FUNCTION int TeamCG::template invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandle& handle) { + typedef int OrdinalType; + typedef typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + + size_t maximum_iteration = handle.get_max_iteration(); + const MagnitudeType tolerance = handle.get_tolerance(); + + using ScratchPadNormViewType = Kokkos::View< + MagnitudeType*, + typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + using TeamCopy1D = TeamCopy; + + const OrdinalType numMatrices = _X.extent(0); + const OrdinalType numRows = _X.extent(1); + + ScratchPadVectorViewType P( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType Q( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType R( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType X( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + + ScratchPadNormViewType sqr_norm_0( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType sqr_norm_j( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType alpha( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType mask( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType tmp( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + + TeamCopy::invoke(member, _X, X); + // Deep copy of b into r_0: + TeamCopy::invoke(member, _B, R); + + // r_0 := b - A x_0 + member.team_barrier(); + A.template apply(member, X, R, -1, 1); + member.team_barrier(); + + // Deep copy of r_0 into p_0: + TeamCopy::invoke(member, R, P); + + TeamDot::invoke(member, R, R, sqr_norm_0); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + mask(i) = + sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; + }); + + TeamCopy1D::invoke(member, sqr_norm_0, sqr_norm_j); + + int status = 1; + int number_not_converged = 0; + + for (size_t j = 0; j < maximum_iteration; ++j) { + // q := A p_j + A.template apply(member, P, Q); member.team_barrier(); - A.template apply(member, X, R, -1, 1); + + TeamDot::invoke(member, P, Q, tmp); member.team_barrier(); - // Deep copy of r_0 into p_0: - TeamCopy::invoke(member, R, P); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + alpha(i) = + mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; + }); + member.team_barrier(); - TeamDot::invoke(member, R, R, sqr_norm_0); + // x_{j+1} := alpha p_j + x_j + TeamAxpy::invoke(member, alpha, P, X); + member.team_barrier(); + + // r_{j+1} := - alpha q + r_j + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { alpha(i) = -alpha(i); }); + member.team_barrier(); + + TeamAxpy::invoke(member, alpha, Q, R); + member.team_barrier(); + + TeamDot::invoke(member, R, R, tmp); member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& i) { - mask(i) = - sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; + alpha(i) = + mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; }); - TeamCopy1D::invoke(member, sqr_norm_0, sqr_norm_j); - - int status = 1; - int number_not_converged = 0; - - for (size_t j = 0; j < maximum_iteration; ++j) { - // q := A p_j - A.template apply(member, P, Q); - member.team_barrier(); - - TeamDot::invoke(member, P, Q, tmp); - member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; - }); - member.team_barrier(); - - // x_{j+1} := alpha p_j + x_j - TeamAxpy::invoke(member, alpha, P, X); - member.team_barrier(); - - // r_{j+1} := - alpha q + r_j - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { alpha(i) = -alpha(i); }); - member.team_barrier(); - - TeamAxpy::invoke(member, alpha, Q, R); - member.team_barrier(); - - TeamDot::invoke(member, R, R, tmp); - member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; - }); - - TeamCopy1D::invoke(member, tmp, sqr_norm_j); - - // Relative convergence check: - number_not_converged = 0; - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i, int& lnumber_not_converged) { - if (sqr_norm_j(i) / sqr_norm_0(i) > tolerance * tolerance) - ++lnumber_not_converged; - else - mask(i) = 0.; - }, - number_not_converged); - - member.team_barrier(); - - if (number_not_converged == 0) { - status = 0; - break; - } - - // p_{j+1} := alpha p_j + r_{j+1} - TeamXpay::invoke(member, alpha, R, P); - member.team_barrier(); + TeamCopy1D::invoke(member, tmp, sqr_norm_j); + + // Relative convergence check: + number_not_converged = 0; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i, int& lnumber_not_converged) { + if (sqr_norm_j(i) / sqr_norm_0(i) > tolerance * tolerance) + ++lnumber_not_converged; + else + mask(i) = 0.; + }, + number_not_converged); + + member.team_barrier(); + + if (number_not_converged == 0) { + status = 0; + break; } - TeamCopy::invoke(member, X, _X); - return status; + // p_{j+1} := alpha p_j + r_{j+1} + TeamXpay::invoke(member, alpha, R, P); + member.team_barrier(); } -}; + + TeamCopy::invoke(member, X, _X); + return status; +} + } // namespace KokkosBatched #endif diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp index db6accce2f..213c06c56a 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp @@ -62,272 +62,267 @@ namespace KokkosBatched { /// Serial GMRES /// -struct SerialGMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const PrecOperatorType& P, - const KrylovHandleType& handle, - const int GMRES_id) { - typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - typedef Kokkos::Details::ArithTraits ATM; - - using SerialCopy1D = SerialCopy; - using SerialCopy2D = SerialCopy; - - const OrdinalType numMatrices = _X.extent(0); - const OrdinalType numRows = _X.extent(1); - - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; - const MagnitudeType tolerance = handle.get_tolerance(); - const MagnitudeType max_tolerance = handle.get_max_tolerance(); - - int n_V = numRows; - int n_H = maximum_iteration + 1; - int n_Givens = 2; - - int offset_V = 0; - int offset_H = offset_V + n_V; - int offset_Givens = offset_H + n_H; - - const int first_matrix = handle.first_index(GMRES_id); - const int last_matrix = handle.last_index(GMRES_id); - - auto V_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); - auto H_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); - auto Givens_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, - Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); - - int n_G = maximum_iteration + 1; - int n_W = numRows; - int n_mask = 1; - - int offset_G = 0; - int offset_W = offset_G + n_G; - int offset_mask = offset_W + n_W; - int offset_tmp = offset_mask + n_mask; - - auto G = Kokkos::subview(handle.tmp_view, - Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::make_pair(offset_G, offset_G + n_G)); - auto W = Kokkos::subview(handle.tmp_view, - Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::make_pair(offset_W, offset_W + n_W)); - auto mask = Kokkos::subview(handle.tmp_view, - Kokkos::make_pair(first_matrix, last_matrix), - offset_mask); - auto tmp = Kokkos::subview(handle.tmp_view, - Kokkos::make_pair(first_matrix, last_matrix), - offset_tmp); - - // Deep copy of b into r_0: - SerialCopy2D::invoke(_B, W); - - // r_0 := b - A x_0 - A.template apply(_X, W, -1, 1); - - P.template apply(W, W); - - SerialDot::invoke(W, W, tmp); - - for (OrdinalType i = 0; i < numMatrices; ++i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_norm(GMRES_id, i, 0, tmp(i)); - if (tmp(i) > max_tolerance) { - mask(i) = 1; - G(i, 0) = tmp(i); - tmp(i) = 1. / tmp(i); - } else { - handle.set_iteration(GMRES_id, i, 0); - mask(i) = 0; - G(i, 0) = 0.; - tmp(i) = 0.; - } +template +KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle, + const int GMRES_id) { + typedef int OrdinalType; + typedef typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef Kokkos::Details::ArithTraits ATM; + + using SerialCopy1D = SerialCopy; + using SerialCopy2D = SerialCopy; + + const OrdinalType numMatrices = _X.extent(0); + const OrdinalType numRows = _X.extent(1); + + size_t maximum_iteration = handle.get_max_iteration() < numRows + ? handle.get_max_iteration() + : numRows; + const MagnitudeType tolerance = handle.get_tolerance(); + const MagnitudeType max_tolerance = handle.get_max_tolerance(); + + int n_V = numRows; + int n_H = maximum_iteration + 1; + int n_Givens = 2; + + int offset_V = 0; + int offset_H = offset_V + n_V; + int offset_Givens = offset_H + n_H; + + const int first_matrix = handle.first_index(GMRES_id); + const int last_matrix = handle.last_index(GMRES_id); + + auto V_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + + int n_G = maximum_iteration + 1; + int n_W = numRows; + int n_mask = 1; + + int offset_G = 0; + int offset_W = offset_G + n_G; + int offset_mask = offset_W + n_W; + int offset_tmp = offset_mask + n_mask; + + auto G = Kokkos::subview(handle.tmp_view, + Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(handle.tmp_view, + Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::make_pair(offset_W, offset_W + n_W)); + auto mask = Kokkos::subview(handle.tmp_view, + Kokkos::make_pair(first_matrix, last_matrix), + offset_mask); + auto tmp = + Kokkos::subview(handle.tmp_view, + Kokkos::make_pair(first_matrix, last_matrix), offset_tmp); + + // Deep copy of b into r_0: + SerialCopy2D::invoke(_B, W); + + // r_0 := b - A x_0 + A.template apply(_X, W, -1, 1); + + P.template apply(W, W); + + SerialDot::invoke(W, W, tmp); + + for (OrdinalType i = 0; i < numMatrices; ++i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_norm(GMRES_id, i, 0, tmp(i)); + if (tmp(i) > max_tolerance) { + mask(i) = 1; + G(i, 0) = tmp(i); + tmp(i) = 1. / tmp(i); + } else { + handle.set_iteration(GMRES_id, i, 0); + mask(i) = 0; + G(i, 0) = 0.; + tmp(i) = 0.; } + } - auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); - for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) { - V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - } + auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); + for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { + for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) { + V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); } - int status = 1; - // int number_not_converged = 0; + } + int status = 1; + // int number_not_converged = 0; - for (size_t j = 0; j < maximum_iteration; ++j) { - // q := A p_j - auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL); + for (size_t j = 0; j < maximum_iteration; ++j) { + // q := A p_j + auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL); - A.template apply(V_j, W); + A.template apply(V_j, W); - P.template apply(W, W); + P.template apply(W, W); - if (handle.get_ortho_strategy() == 0) { - for (OrdinalType l = 0; l < numMatrices; ++l) { - auto W_l = Kokkos::subview(W, l, Kokkos::ALL); - auto V_old = Kokkos::subview( - V_view, l, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); - auto H_old = - Kokkos::subview(H_view, l, j, Kokkos::make_pair(0, (int)j + 1)); + if (handle.get_ortho_strategy() == 0) { + for (OrdinalType l = 0; l < numMatrices; ++l) { + auto W_l = Kokkos::subview(W, l, Kokkos::ALL); + auto V_old = Kokkos::subview( + V_view, l, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = + Kokkos::subview(H_view, l, j, Kokkos::make_pair(0, (int)j + 1)); - // Inner products - SerialGemv::invoke( - 1, V_old, W_l, 0, H_old); + // Inner products + SerialGemv::invoke( + 1, V_old, W_l, 0, H_old); - // Update - SerialGemv::invoke( - -1, V_old, H_old, 1, W_l); - } + // Update + SerialGemv::invoke( + -1, V_old, H_old, 1, W_l); } - if (handle.get_ortho_strategy() == 1) { - for (size_t i = 0; i < j + 1; ++i) { - auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); - SerialDot::invoke(W, V_i, tmp); - SerialCopy1D::invoke(tmp, Kokkos::subview(H_view, Kokkos::ALL, j, i)); - for (OrdinalType ii = 0; ii < numMatrices; ++ii) tmp(ii) = -tmp(ii); - - SerialAxpy::invoke(tmp, V_i, W); - } + } + if (handle.get_ortho_strategy() == 1) { + for (size_t i = 0; i < j + 1; ++i) { + auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); + SerialDot::invoke(W, V_i, tmp); + SerialCopy1D::invoke(tmp, Kokkos::subview(H_view, Kokkos::ALL, j, i)); + for (OrdinalType ii = 0; ii < numMatrices; ++ii) tmp(ii) = -tmp(ii); + + SerialAxpy::invoke(tmp, V_i, W); } + } - SerialDot::invoke(W, W, tmp); + SerialDot::invoke(W, W, tmp); - for (OrdinalType i = 0; i < numMatrices; ++i) { - H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); - tmp(i) = - H_view(i, j, j + 1) > max_tolerance ? 1. / H_view(i, j, j + 1) : 0.; - } + for (OrdinalType i = 0; i < numMatrices; ++i) { + H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); + tmp(i) = + H_view(i, j, j + 1) > max_tolerance ? 1. / H_view(i, j, j + 1) : 0.; + } - if (j + 1 < maximum_iteration) { - auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); - for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { - for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) { - V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - } + if (j + 1 < maximum_iteration) { + auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); + for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { + for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) { + V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); } } + } - for (OrdinalType l = 0; l < numMatrices; ++l) { - // Apply the previous Givens rotations: - auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); - auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); - auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); - - if (mask(l) == 1.) { - for (size_t i = 0; i < j; ++i) { - auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); - auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); - H_j(i) = tmp1; - H_j(i + 1) = tmp2; - } - - // Compute the new Givens rotation: - Kokkos::pair - G_new(1, 0); - typename VectorViewType::non_const_value_type alpha = 0; - SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); - - Givens_0_l(j) = G_new.first; - Givens_1_l(j) = G_new.second; - - // Apply the new Givens rotation: - auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); - auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); - H_j(j) = tmp1; - H_j(j + 1) = tmp2; - - G(l, j + 1) = -Givens_1_l(j) * G(l, j); - G(l, j) *= Givens_0_l(j); - } else { - H_j(j) = 1.; - G(l, j + 1) = 0.; + for (OrdinalType l = 0; l < numMatrices; ++l) { + // Apply the previous Givens rotations: + auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); + auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); + auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); + + if (mask(l) == 1.) { + for (size_t i = 0; i < j; ++i) { + auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); + auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); + H_j(i) = tmp1; + H_j(i + 1) = tmp2; } - auto res_norm = Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + // Compute the new Givens rotation: + Kokkos::pair + G_new(1, 0); + typename VectorViewType::non_const_value_type alpha = 0; + SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); - handle.set_norm(GMRES_id, l, j + 1, res_norm); + Givens_0_l(j) = G_new.first; + Givens_1_l(j) = G_new.second; - if (mask(l) == 1. && res_norm < tolerance) { - mask(l) = 0.; - G(l, j + 1) = 0.; - handle.set_iteration(GMRES_id, l, j + 1); - } - } + // Apply the new Givens rotation: + auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); + auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); + H_j(j) = tmp1; + H_j(j + 1) = tmp2; - bool all_converged = true; - for (OrdinalType l = 0; l < numMatrices; ++l) - all_converged = (all_converged && mask(l) == 0.); - if (all_converged) { - maximum_iteration = j + 1; - break; + G(l, j + 1) = -Givens_1_l(j) * G(l, j); + G(l, j) *= Givens_0_l(j); + } else { + H_j(j) = 1.; + G(l, j + 1) = 0.; } - } - for (OrdinalType l = 0; l < numMatrices; ++l) { - for (size_t i = 0; i < maximum_iteration; ++i) { - size_t row_i = maximum_iteration - 1 - i; - for (size_t j = row_i + 1; j < maximum_iteration; ++j) - G(l, row_i) -= H_view(l, j, row_i) * G(l, j); - G(l, row_i) /= H_view(l, row_i, row_i); - } - } + auto res_norm = Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); - if (handle.get_ortho_strategy() == 0) { - for (OrdinalType l = 0; l < numMatrices; ++l) { - SerialGemv::invoke( - 1, - Kokkos::subview(V_view, l, - Kokkos::make_pair(0, (int)maximum_iteration), - Kokkos::ALL), - Kokkos::subview(G, l, Kokkos::make_pair(0, (int)maximum_iteration)), - 1, Kokkos::subview(_X, l, Kokkos::ALL)); + handle.set_norm(GMRES_id, l, j + 1, res_norm); + + if (mask(l) == 1. && res_norm < tolerance) { + mask(l) = 0.; + G(l, j + 1) = 0.; + handle.set_iteration(GMRES_id, l, j + 1); } } - if (handle.get_ortho_strategy() == 1) { - for (size_t j = 0; j < maximum_iteration; ++j) { - SerialAxpy::invoke(Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), - _X); - } + + bool all_converged = true; + for (OrdinalType l = 0; l < numMatrices; ++l) + all_converged = (all_converged && mask(l) == 0.); + if (all_converged) { + maximum_iteration = j + 1; + break; } + } - if (handle.get_compute_last_residual()) { - SerialCopy2D::invoke(_B, W); - A.template apply(_X, W, -1, 1); - P.template apply(W, W); - SerialDot::invoke(W, W, tmp); + auto first_indices = Kokkos::make_pair(0, (int)maximum_iteration); - for (OrdinalType i = 0; i < numMatrices; ++i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_last_norm(GMRES_id, i, tmp(i)); - } + for (OrdinalType l = 0; l < numMatrices; ++l) { + auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); + auto B_l = Kokkos::subview(G, l, first_indices); + + SerialTrsm::template invoke(1, A_l, B_l); + } + + if (handle.get_ortho_strategy() == 0) { + for (OrdinalType l = 0; l < numMatrices; ++l) { + SerialGemv::invoke( + 1, Kokkos::subview(V_view, l, first_indices, Kokkos::ALL), + Kokkos::subview(G, l, first_indices), 1, + Kokkos::subview(_X, l, Kokkos::ALL)); + } + } + if (handle.get_ortho_strategy() == 1) { + for (size_t j = 0; j < maximum_iteration; ++j) { + SerialAxpy::invoke(Kokkos::subview(G, Kokkos::ALL, j), + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), + _X); } - return status; } - template - KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandleType& handle) { - Identity P; - return invoke(A, _B, _X, P, handle); + if (handle.get_compute_last_residual()) { + SerialCopy2D::invoke(_B, W); + A.template apply(_X, W, -1, 1); + P.template apply(W, W); + SerialDot::invoke(W, W, tmp); + + for (OrdinalType i = 0; i < numMatrices; ++i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_last_norm(GMRES_id, i, tmp(i)); + } } -}; + return status; +} + +template +KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle) { + Identity P; + return invoke(A, _B, _X, P, handle); +} } // namespace KokkosBatched #endif diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp index dfc9d96518..b3696cf9a9 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp @@ -65,334 +65,324 @@ namespace KokkosBatched { /// template -struct TeamVectorGMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const PrecOperatorType& P, - const KrylovHandleType& handle) { - typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - typedef Kokkos::Details::ArithTraits ATM; - - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using TeamVectorCopy1D = TeamVectorCopy; - - const OrdinalType numMatrices = _X.extent(0); - const OrdinalType numRows = _X.extent(1); - - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; - const MagnitudeType tolerance = handle.get_tolerance(); - const MagnitudeType max_tolerance = handle.get_max_tolerance(); - - int n_V = numRows; - int n_H = maximum_iteration + 1; - int n_Givens = 2; - - int offset_V = 0; - int offset_H = offset_V + n_V; - int offset_Givens = offset_H + n_H; - - const int first_matrix = handle.first_index(member.league_rank()); - const int last_matrix = handle.last_index(member.league_rank()); - - auto V_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); - auto H_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); - auto Givens_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, - Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); - - int n_G = maximum_iteration + 1; - int n_W = numRows; - int n_X = numRows; - int n_mask = 1; - int n_tmp = 1; - - int offset_G = 0; - int offset_W = offset_G + n_G; - int offset_X = offset_W + n_W; - int offset_mask = offset_X + n_X; - int offset_tmp = offset_mask + n_mask; - - ScratchPadVectorViewType tmp_2D( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - n_G + n_W + n_X + n_mask + n_tmp); - - auto G = Kokkos::subview(tmp_2D, Kokkos::ALL, - Kokkos::make_pair(offset_G, offset_G + n_G)); - auto W = Kokkos::subview(tmp_2D, Kokkos::ALL, - Kokkos::make_pair(offset_W, offset_W + n_W)); - auto X = Kokkos::subview(tmp_2D, Kokkos::ALL, - Kokkos::make_pair(offset_X, offset_X + n_X)); - auto mask = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_mask); - auto tmp = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_tmp); - - TeamVectorCopy::invoke(member, _X, X); - // Deep copy of b into r_0: - TeamVectorCopy::invoke(member, _B, W); - - // r_0 := b - A x_0 - member.team_barrier(); - A.template apply(member, X, W, -1, 1); +template +KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::template invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle) { + typedef int OrdinalType; + typedef typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef Kokkos::Details::ArithTraits ATM; + + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + using TeamVectorCopy1D = TeamVectorCopy; + + const OrdinalType numMatrices = _X.extent(0); + const OrdinalType numRows = _X.extent(1); + + size_t maximum_iteration = handle.get_max_iteration() < numRows + ? handle.get_max_iteration() + : numRows; + const MagnitudeType tolerance = handle.get_tolerance(); + const MagnitudeType max_tolerance = handle.get_max_tolerance(); + + int n_V = numRows; + int n_H = maximum_iteration + 1; + int n_Givens = 2; + + int offset_V = 0; + int offset_H = offset_V + n_V; + int offset_Givens = offset_H + n_H; + + const int first_matrix = handle.first_index(member.league_rank()); + const int last_matrix = handle.last_index(member.league_rank()); + + auto V_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + + int n_G = maximum_iteration + 1; + int n_W = numRows; + int n_X = numRows; + int n_mask = 1; + int n_tmp = 1; + + int offset_G = 0; + int offset_W = offset_G + n_G; + int offset_X = offset_W + n_W; + int offset_mask = offset_X + n_X; + int offset_tmp = offset_mask + n_mask; + + ScratchPadVectorViewType tmp_2D( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_X + n_mask + n_tmp); + + auto G = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_W, offset_W + n_W)); + auto X = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_X, offset_X + n_X)); + auto mask = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_mask); + auto tmp = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_tmp); + + TeamVectorCopy::invoke(member, _X, X); + // Deep copy of b into r_0: + TeamVectorCopy::invoke(member, _B, W); + + // r_0 := b - A x_0 + member.team_barrier(); + A.template apply(member, X, W, -1, 1); + member.team_barrier(); + + P.template apply(member, W, W); + member.team_barrier(); + + TeamVectorDot::invoke(member, W, W, tmp); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_norm(member.league_rank(), i, 0, tmp(i)); + if (tmp(i) > max_tolerance) { + mask(i) = 1; + G(i, 0) = tmp(i); + tmp(i) = 1. / tmp(i); + } else { + handle.set_iteration(member.league_rank(), i, 0); + mask(i) = 0; + G(i, 0) = 0.; + tmp(i) = 0.; + } + }); + + member.team_barrier(); // Finish writing to tmp + + auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), + [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices( + iTemp, numRows, numMatrices, iRow, iMatrix); + V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); + int status = 1; + // int number_not_converged = 0; + + for (size_t j = 0; j < maximum_iteration; ++j) { + member.team_barrier(); // Finish writing to V + // q := A p_j + auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL); + + A.template apply(member, V_j, W); member.team_barrier(); P.template apply(member, W, W); member.team_barrier(); - TeamVectorDot::invoke(member, W, W, tmp); - member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_norm(member.league_rank(), i, 0, tmp(i)); - if (tmp(i) > max_tolerance) { - mask(i) = 1; - G(i, 0) = tmp(i); - tmp(i) = 1. / tmp(i); - } else { - handle.set_iteration(member.league_rank(), i, 0); - mask(i) = 0; - G(i, 0) = 0.; - tmp(i) = 0.; - } - }); - - member.team_barrier(); // Finish writing to tmp - - auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); - int status = 1; - // int number_not_converged = 0; - - for (size_t j = 0; j < maximum_iteration; ++j) { - member.team_barrier(); // Finish writing to V - // q := A p_j - auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL); - - A.template apply(member, V_j, W); + if (handle.get_ortho_strategy() == 0) { + auto V_old = Kokkos::subview( + V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, + Kokkos::make_pair(0, (int)j + 1)); member.team_barrier(); - - P.template apply(member, W, W); + // Inner products + TeamVectorGemv::invoke(member, 1, V_old, W, 0, + H_old); member.team_barrier(); - if (handle.get_ortho_strategy() == 0) { - auto V_old = Kokkos::subview( - V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); - auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, - Kokkos::make_pair(0, (int)j + 1)); + // Update + TeamVectorGemv::invoke(member, -1, V_old, H_old, 1, + W); + member.team_barrier(); + } + if (handle.get_ortho_strategy() == 1) { + for (size_t i = 0; i < j + 1; ++i) { + auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); + TeamVectorDot::invoke(member, W, V_i, tmp); member.team_barrier(); - // Inner products - TeamVectorGemv::invoke(member, 1, V_old, W, 0, - H_old); + TeamVectorCopy1D::invoke(member, tmp, + Kokkos::subview(H_view, Kokkos::ALL, j, i)); member.team_barrier(); + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); - // Update - TeamVectorGemv::invoke(member, -1, V_old, H_old, - 1, W); - member.team_barrier(); - } - if (handle.get_ortho_strategy() == 1) { - for (size_t i = 0; i < j + 1; ++i) { - auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); - TeamVectorDot::invoke(member, W, V_i, tmp); - member.team_barrier(); - TeamVectorCopy1D::invoke(member, tmp, - Kokkos::subview(H_view, Kokkos::ALL, j, i)); - member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); - - member.team_barrier(); // Finish writing to tmp - - TeamVectorAxpy::invoke(member, tmp, V_i, W); - member.team_barrier(); // Finish writing to W - } - } + member.team_barrier(); // Finish writing to tmp - member.team_barrier(); // Finish writing to W - TeamVectorDot::invoke(member, W, W, tmp); - member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); - tmp(i) = H_view(i, j, j + 1) > max_tolerance - ? 1. / H_view(i, j, j + 1) - : 0.; - }); - member.team_barrier(); - if (j + 1 < maximum_iteration) { - auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); - member.team_barrier(); + TeamVectorAxpy::invoke(member, tmp, V_i, W); + member.team_barrier(); // Finish writing to W } + } + member.team_barrier(); // Finish writing to W + TeamVectorDot::invoke(member, W, W, tmp); + member.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); + tmp(i) = H_view(i, j, j + 1) > max_tolerance + ? 1. / H_view(i, j, j + 1) + : 0.; + }); + member.team_barrier(); + if (j + 1 < maximum_iteration) { + auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - // Apply the previous Givens rotations: - auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); - auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); - auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); - - if (mask(l) == 1.) { - for (size_t i = 0; i < j; ++i) { - auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); - auto tmp2 = - -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); - H_j(i) = tmp1; - H_j(i + 1) = tmp2; - } - - // Compute the new Givens rotation: - Kokkos::pair - G_new(1, 0); - typename VectorViewType::non_const_value_type alpha = 0; - SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); - - Givens_0_l(j) = G_new.first; - Givens_1_l(j) = G_new.second; - - // Apply the new Givens rotation: - auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); - auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); - H_j(j) = tmp1; - H_j(j + 1) = tmp2; - - G(l, j + 1) = -Givens_1_l(j) * G(l, j); - G(l, j) *= Givens_0_l(j); - } else { - H_j(j) = 1.; - G(l, j + 1) = 0.; + Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), + [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices( + iTemp, numRows, numMatrices, iRow, iMatrix); + V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); + member.team_barrier(); + } + + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& l) { + // Apply the previous Givens rotations: + auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); + auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); + auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); + + if (mask(l) == 1.) { + for (size_t i = 0; i < j; ++i) { + auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); + auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); + H_j(i) = tmp1; + H_j(i + 1) = tmp2; } - auto res_norm = - Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + // Compute the new Givens rotation: + Kokkos::pair + G_new(1, 0); + typename VectorViewType::non_const_value_type alpha = 0; + SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); + + Givens_0_l(j) = G_new.first; + Givens_1_l(j) = G_new.second; + + // Apply the new Givens rotation: + auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); + auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); + H_j(j) = tmp1; + H_j(j + 1) = tmp2; + + G(l, j + 1) = -Givens_1_l(j) * G(l, j); + G(l, j) *= Givens_0_l(j); + } else { + H_j(j) = 1.; + G(l, j + 1) = 0.; + } + + auto res_norm = + Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + + handle.set_norm(member.league_rank(), l, j + 1, res_norm); + + if (mask(l) == 1. && res_norm < tolerance) { + mask(l) = 0.; + G(l, j + 1) = 0.; + handle.set_iteration(member.league_rank(), l, j + 1); + } + }); + member.team_barrier(); - handle.set_norm(member.league_rank(), l, j + 1, res_norm); + bool all_converged = true; + for (OrdinalType l = 0; l < numMatrices; ++l) + all_converged = (all_converged && mask(l) == 0.); + if (all_converged) { + maximum_iteration = j + 1; + break; + } + } - if (mask(l) == 1. && res_norm < tolerance) { - mask(l) = 0.; - G(l, j + 1) = 0.; - handle.set_iteration(member.league_rank(), l, j + 1); - } - }); - member.team_barrier(); + member.team_barrier(); // Finish writing to G - bool all_converged = true; - for (OrdinalType l = 0; l < numMatrices; ++l) - all_converged = (all_converged && mask(l) == 0.); - if (all_converged) { - maximum_iteration = j + 1; - break; - } - } + auto first_indices = Kokkos::make_pair(0, (int)maximum_iteration); - member.team_barrier(); // Finish writing to G + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& l) { + auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); + auto B_l = Kokkos::subview(G, l, first_indices); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - for (size_t i = 0; i < maximum_iteration; ++i) { - size_t row_i = maximum_iteration - 1 - i; - for (size_t j = row_i + 1; j < maximum_iteration; - ++j) - G(l, row_i) -= H_view(l, j, row_i) * G(l, j); - G(l, row_i) /= H_view(l, row_i, row_i); - } - }); + SerialTrsm::template invoke(1, A_l, B_l); + }); - member.team_barrier(); // Finish writing to G + member.team_barrier(); // Finish writing to G - if (handle.get_ortho_strategy() == 0) { - TeamVectorGemv:: - invoke(member, 1, - Kokkos::subview(V_view, Kokkos::ALL, - Kokkos::make_pair(0, (int)maximum_iteration), - Kokkos::ALL), - Kokkos::subview(G, Kokkos::ALL, - Kokkos::make_pair(0, (int)maximum_iteration)), - 1, X); - } - if (handle.get_ortho_strategy() == 1) { - for (size_t j = 0; j < maximum_iteration; ++j) { - TeamVectorAxpy::invoke( - member, Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), X); - member.team_barrier(); // Finish writing to X - } + if (handle.get_ortho_strategy() == 0) { + TeamVectorGemv::invoke( + member, 1, + Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), + Kokkos::subview(G, Kokkos::ALL, first_indices), 1, X); + } + if (handle.get_ortho_strategy() == 1) { + for (size_t j = 0; j < maximum_iteration; ++j) { + TeamVectorAxpy::invoke( + member, Kokkos::subview(G, Kokkos::ALL, j), + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), X); + member.team_barrier(); // Finish writing to X } + } - member.team_barrier(); // Finish writing to X + member.team_barrier(); // Finish writing to X - TeamVectorCopy::invoke(member, X, _X); + TeamVectorCopy::invoke(member, X, _X); - member.team_barrier(); + member.team_barrier(); - if (handle.get_compute_last_residual()) { - TeamVectorCopy::invoke(member, _B, W); - member.team_barrier(); - A.template apply(member, X, W, -1, - 1); - member.team_barrier(); - P.template apply(member, W, W); - member.team_barrier(); - TeamVectorDot::invoke(member, W, W, tmp); - member.team_barrier(); + if (handle.get_compute_last_residual()) { + TeamVectorCopy::invoke(member, _B, W); + member.team_barrier(); + A.template apply(member, X, W, -1, 1); + member.team_barrier(); + P.template apply(member, W, W); + member.team_barrier(); + TeamVectorDot::invoke(member, W, W, tmp); + member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_last_norm(member.league_rank(), i, - tmp(i)); - }); - } - return status; + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_last_norm(member.league_rank(), i, + tmp(i)); + }); } + return status; +} + +template +template +KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::template invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle) { + Identity P; + return invoke(member, A, _B, _X, P, + handle); +} - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandleType& handle) { - Identity P; - return invoke(member, A, _B, _X, P, - handle); - } -}; } // namespace KokkosBatched #endif diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp index fdbde3d278..b09a5c7b93 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp @@ -64,331 +64,322 @@ namespace KokkosBatched { /// template -struct TeamGMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const PrecOperatorType& P, - const KrylovHandleType& handle) { - typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - typedef Kokkos::Details::ArithTraits ATM; - - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using TeamCopy1D = TeamCopy; - - const OrdinalType numMatrices = _X.extent(0); - const OrdinalType numRows = _X.extent(1); - - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? handle.get_max_iteration() - : numRows; - const MagnitudeType tolerance = handle.get_tolerance(); - const MagnitudeType max_tolerance = handle.get_max_tolerance(); - - int n_V = numRows; - int n_H = maximum_iteration + 1; - int n_Givens = 2; - - int offset_V = 0; - int offset_H = offset_V + n_V; - int offset_Givens = offset_H + n_H; - - const int first_matrix = handle.first_index(member.league_rank()); - const int last_matrix = handle.last_index(member.league_rank()); - - auto V_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); - auto H_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); - auto Givens_view = Kokkos::subview( - handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), - Kokkos::ALL, - Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); - - int n_G = maximum_iteration + 1; - int n_W = numRows; - int n_X = numRows; - int n_mask = 1; - int n_tmp = 1; - - int offset_G = 0; - int offset_W = offset_G + n_G; - int offset_X = offset_W + n_W; - int offset_mask = offset_X + n_X; - int offset_tmp = offset_mask + n_mask; - - ScratchPadVectorViewType tmp_2D( - member.team_scratch(handle.get_scratch_pad_level()), numMatrices, - n_G + n_W + n_X + n_mask + n_tmp); - - auto G = Kokkos::subview(tmp_2D, Kokkos::ALL, - Kokkos::make_pair(offset_G, offset_G + n_G)); - auto W = Kokkos::subview(tmp_2D, Kokkos::ALL, - Kokkos::make_pair(offset_W, offset_W + n_W)); - auto X = Kokkos::subview(tmp_2D, Kokkos::ALL, - Kokkos::make_pair(offset_X, offset_X + n_X)); - auto mask = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_mask); - auto tmp = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_tmp); - - TeamCopy::invoke(member, _X, X); - // Deep copy of b into r_0: - TeamCopy::invoke(member, _B, W); - - // r_0 := b - A x_0 - member.team_barrier(); - A.template apply(member, X, W, -1, 1); +template +KOKKOS_INLINE_FUNCTION int TeamGMRES::template invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle) { + typedef int OrdinalType; + typedef typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef Kokkos::Details::ArithTraits ATM; + + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + using TeamCopy1D = TeamCopy; + + const OrdinalType numMatrices = _X.extent(0); + const OrdinalType numRows = _X.extent(1); + + size_t maximum_iteration = handle.get_max_iteration() < numRows + ? handle.get_max_iteration() + : numRows; + const MagnitudeType tolerance = handle.get_tolerance(); + const MagnitudeType max_tolerance = handle.get_max_tolerance(); + + int n_V = numRows; + int n_H = maximum_iteration + 1; + int n_Givens = 2; + + int offset_V = 0; + int offset_H = offset_V + n_V; + int offset_Givens = offset_H + n_H; + + const int first_matrix = handle.first_index(member.league_rank()); + const int last_matrix = handle.last_index(member.league_rank()); + + auto V_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + + int n_G = maximum_iteration + 1; + int n_W = numRows; + int n_X = numRows; + int n_mask = 1; + int n_tmp = 1; + + int offset_G = 0; + int offset_W = offset_G + n_G; + int offset_X = offset_W + n_W; + int offset_mask = offset_X + n_X; + int offset_tmp = offset_mask + n_mask; + + ScratchPadVectorViewType tmp_2D( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_X + n_mask + n_tmp); + + auto G = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_W, offset_W + n_W)); + auto X = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_X, offset_X + n_X)); + auto mask = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_mask); + auto tmp = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_tmp); + + TeamCopy::invoke(member, _X, X); + // Deep copy of b into r_0: + TeamCopy::invoke(member, _B, W); + + // r_0 := b - A x_0 + member.team_barrier(); + A.template apply(member, X, W, -1, 1); + member.team_barrier(); + + P.template apply(member, W, W); + member.team_barrier(); + + TeamDot::invoke(member, W, W, tmp); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_norm(member.league_rank(), i, 0, tmp(i)); + if (tmp(i) > max_tolerance) { + mask(i) = 1; + G(i, 0) = tmp(i); + tmp(i) = 1. / tmp(i); + } else { + handle.set_iteration(member.league_rank(), i, 0); + mask(i) = 0; + G(i, 0) = 0.; + tmp(i) = 0.; + } + }); + + member.team_barrier(); // Finish writing to tmp + + auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), + [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices( + iTemp, numRows, numMatrices, iRow, iMatrix); + V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); + int status = 1; + // int number_not_converged = 0; + + for (size_t j = 0; j < maximum_iteration; ++j) { + member.team_barrier(); // Finish writing to V + // q := A p_j + auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL); + + A.template apply(member, V_j, W); member.team_barrier(); P.template apply(member, W, W); member.team_barrier(); - TeamDot::invoke(member, W, W, tmp); - member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_norm(member.league_rank(), i, 0, tmp(i)); - if (tmp(i) > max_tolerance) { - mask(i) = 1; - G(i, 0) = tmp(i); - tmp(i) = 1. / tmp(i); - } else { - handle.set_iteration(member.league_rank(), i, 0); - mask(i) = 0; - G(i, 0) = 0.; - tmp(i) = 0.; - } - }); - - member.team_barrier(); // Finish writing to tmp - - auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); - int status = 1; - // int number_not_converged = 0; - - for (size_t j = 0; j < maximum_iteration; ++j) { - member.team_barrier(); // Finish writing to V - // q := A p_j - auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL); - - A.template apply(member, V_j, W); + if (handle.get_ortho_strategy() == 0) { + auto V_old = Kokkos::subview( + V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, + Kokkos::make_pair(0, (int)j + 1)); member.team_barrier(); - - P.template apply(member, W, W); + // Inner products + TeamGemv::invoke( + member, 1, V_old, W, 0, H_old); member.team_barrier(); - if (handle.get_ortho_strategy() == 0) { - auto V_old = Kokkos::subview( - V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); - auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, - Kokkos::make_pair(0, (int)j + 1)); + // Update + TeamGemv::invoke( + member, -1, V_old, H_old, 1, W); + member.team_barrier(); + } + if (handle.get_ortho_strategy() == 1) { + for (size_t i = 0; i < j + 1; ++i) { + auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); + TeamDot::invoke(member, W, V_i, tmp); member.team_barrier(); - // Inner products - TeamGemv::invoke( - member, 1, V_old, W, 0, H_old); + TeamCopy1D::invoke(member, tmp, + Kokkos::subview(H_view, Kokkos::ALL, j, i)); member.team_barrier(); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); - // Update - TeamGemv::invoke( - member, -1, V_old, H_old, 1, W); - member.team_barrier(); - } - if (handle.get_ortho_strategy() == 1) { - for (size_t i = 0; i < j + 1; ++i) { - auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); - TeamDot::invoke(member, W, V_i, tmp); - member.team_barrier(); - TeamCopy1D::invoke(member, tmp, - Kokkos::subview(H_view, Kokkos::ALL, j, i)); - member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); - - member.team_barrier(); // Finish writing to tmp - - TeamAxpy::invoke(member, tmp, V_i, W); - member.team_barrier(); // Finish writing to W - } - } + member.team_barrier(); // Finish writing to tmp - member.team_barrier(); // Finish writing to W - TeamDot::invoke(member, W, W, tmp); - member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); - tmp(i) = H_view(i, j, j + 1) > max_tolerance - ? 1. / H_view(i, j, j + 1) - : 0.; - }); - member.team_barrier(); - if (j + 1 < maximum_iteration) { - auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); - member.team_barrier(); + TeamAxpy::invoke(member, tmp, V_i, W); + member.team_barrier(); // Finish writing to W } + } + member.team_barrier(); // Finish writing to W + TeamDot::invoke(member, W, W, tmp); + member.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); + tmp(i) = H_view(i, j, j + 1) > max_tolerance + ? 1. / H_view(i, j, j + 1) + : 0.; + }); + member.team_barrier(); + if (j + 1 < maximum_iteration) { + auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - // Apply the previous Givens rotations: - auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); - auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); - auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); - - if (mask(l) == 1.) { - for (size_t i = 0; i < j; ++i) { - auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); - auto tmp2 = - -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); - H_j(i) = tmp1; - H_j(i + 1) = tmp2; - } - - // Compute the new Givens rotation: - Kokkos::pair - G_new(1, 0); - typename VectorViewType::non_const_value_type alpha = 0; - SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); - - Givens_0_l(j) = G_new.first; - Givens_1_l(j) = G_new.second; - - // Apply the new Givens rotation: - auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); - auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); - H_j(j) = tmp1; - H_j(j + 1) = tmp2; - - G(l, j + 1) = -Givens_1_l(j) * G(l, j); - G(l, j) *= Givens_0_l(j); - } else { - H_j(j) = 1.; - G(l, j + 1) = 0.; + Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), + [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices( + iTemp, numRows, numMatrices, iRow, iMatrix); + V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); + member.team_barrier(); + } + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& l) { + // Apply the previous Givens rotations: + auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); + auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); + auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); + + if (mask(l) == 1.) { + for (size_t i = 0; i < j; ++i) { + auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); + auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); + H_j(i) = tmp1; + H_j(i + 1) = tmp2; } - auto res_norm = - Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + // Compute the new Givens rotation: + Kokkos::pair + G_new(1, 0); + typename VectorViewType::non_const_value_type alpha = 0; + SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); + + Givens_0_l(j) = G_new.first; + Givens_1_l(j) = G_new.second; + + // Apply the new Givens rotation: + auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); + auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); + H_j(j) = tmp1; + H_j(j + 1) = tmp2; + + G(l, j + 1) = -Givens_1_l(j) * G(l, j); + G(l, j) *= Givens_0_l(j); + } else { + H_j(j) = 1.; + G(l, j + 1) = 0.; + } + + auto res_norm = + Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + + handle.set_norm(member.league_rank(), l, j + 1, res_norm); + + if (mask(l) == 1. && res_norm < tolerance) { + mask(l) = 0.; + G(l, j + 1) = 0.; + handle.set_iteration(member.league_rank(), l, j + 1); + } + }); + member.team_barrier(); - handle.set_norm(member.league_rank(), l, j + 1, res_norm); + bool all_converged = true; + for (OrdinalType l = 0; l < numMatrices; ++l) + all_converged = (all_converged && mask(l) == 0.); + if (all_converged) { + maximum_iteration = j + 1; + break; + } + } - if (mask(l) == 1. && res_norm < tolerance) { - mask(l) = 0.; - G(l, j + 1) = 0.; - handle.set_iteration(member.league_rank(), l, j + 1); - } - }); - member.team_barrier(); + member.team_barrier(); // Finish writing to G - bool all_converged = true; - for (OrdinalType l = 0; l < numMatrices; ++l) - all_converged = (all_converged && mask(l) == 0.); - if (all_converged) { - maximum_iteration = j + 1; - break; - } - } + auto first_indices = Kokkos::make_pair(0, (int)maximum_iteration); - member.team_barrier(); // Finish writing to G + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& l) { + auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); + auto B_l = Kokkos::subview(G, l, first_indices); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - for (size_t i = 0; i < maximum_iteration; ++i) { - size_t row_i = maximum_iteration - 1 - i; - for (size_t j = row_i + 1; j < maximum_iteration; - ++j) - G(l, row_i) -= H_view(l, j, row_i) * G(l, j); - G(l, row_i) /= H_view(l, row_i, row_i); - } - }); + SerialTrsm::template invoke(1, A_l, B_l); + }); - member.team_barrier(); // Finish writing to G + member.team_barrier(); // Finish writing to G - if (handle.get_ortho_strategy() == 0) { - TeamGemv::invoke( - member, 1, - Kokkos::subview(V_view, Kokkos::ALL, - Kokkos::make_pair(0, (int)maximum_iteration), - Kokkos::ALL), - Kokkos::subview(G, Kokkos::ALL, - Kokkos::make_pair(0, (int)maximum_iteration)), - 1, X); - } - if (handle.get_ortho_strategy() == 1) { - for (size_t j = 0; j < maximum_iteration; ++j) { - TeamAxpy::invoke( - member, Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), X); - member.team_barrier(); // Finish writing to X - } + if (handle.get_ortho_strategy() == 0) { + TeamGemv::invoke( + member, 1, + Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), + Kokkos::subview(G, Kokkos::ALL, first_indices), 1, X); + } + if (handle.get_ortho_strategy() == 1) { + for (size_t j = 0; j < maximum_iteration; ++j) { + TeamAxpy::invoke( + member, Kokkos::subview(G, Kokkos::ALL, j), + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), X); + member.team_barrier(); // Finish writing to X } + } - member.team_barrier(); // Finish writing to X + member.team_barrier(); // Finish writing to X - TeamCopy::invoke(member, X, _X); + TeamCopy::invoke(member, X, _X); - member.team_barrier(); + member.team_barrier(); - if (handle.get_compute_last_residual()) { - TeamCopy::invoke(member, _B, W); - member.team_barrier(); - A.template apply(member, X, W, -1, 1); - member.team_barrier(); - P.template apply(member, W, W); - member.team_barrier(); - TeamDot::invoke(member, W, W, tmp); - member.team_barrier(); + if (handle.get_compute_last_residual()) { + TeamCopy::invoke(member, _B, W); + member.team_barrier(); + A.template apply(member, X, W, -1, 1); + member.team_barrier(); + P.template apply(member, W, W); + member.team_barrier(); + TeamDot::invoke(member, W, W, tmp); + member.team_barrier(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - tmp(i) = ATM::sqrt(tmp(i)); - handle.set_last_norm(member.league_rank(), i, - tmp(i)); - }); - } - return status; + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_last_norm(member.league_rank(), i, + tmp(i)); + }); } + return status; +} + +template +template +KOKKOS_INLINE_FUNCTION int TeamGMRES::template invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle) { + Identity P; + return invoke(member, A, _B, _X, P, + handle); +} - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const OperatorType& A, - const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandleType& handle) { - Identity P; - return invoke(member, A, _B, _X, P, - handle); - } -}; } // namespace KokkosBatched #endif From ffaa347749f88ac37eb89d35c8a531d60dda90c8 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 27 Apr 2022 16:12:35 -0600 Subject: [PATCH 115/261] Update View value_type and const_value_type for compile time checks More fixes related to kokkos/kokkos-kernels#1367 --- src/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp b/src/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp index 2165387076..f3c6c6bb67 100644 --- a/src/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp +++ b/src/sparse/impl/KokkosSparse_getDiagCopyWithOffsets_impl.hpp @@ -88,7 +88,8 @@ struct CrsMatrixGetDiagCopyWithOffsetsFunctor { static_cast(DiagType::rank) == 1, "The DiagType template parameter must be a 1-D Kokkos::View."); static_assert( - std::is_same::value, + std::is_same::value, "The DiagType template parameter must be a nonconst Kokkos::View."); static_assert(Kokkos::is_view::value, "The OffsetsType template parameter must be a Kokkos::View."); From 6e1b759d0c100f6b31cdcf91a6436680cf9c95ae Mon Sep 17 00:00:00 2001 From: kliegeois Date: Wed, 27 Apr 2022 17:24:53 -0600 Subject: [PATCH 116/261] Fix expected unqualified-id --- .../sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp | 2 +- src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp | 2 +- src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp | 2 +- .../sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp | 6 +++--- src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp | 6 +++--- src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 3 +-- 6 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp index 11dc805a0c..a106d0ae8f 100644 --- a/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp @@ -63,7 +63,7 @@ namespace KokkosBatched { template template -KOKKOS_INLINE_FUNCTION int TeamVectorCG::template invoke( +KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( const MemberType& member, const OperatorType& A, const VectorViewType& _B, const VectorViewType& _X, const KrylovHandleType& handle) { typedef int OrdinalType; diff --git a/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp index 606ad8d714..cd7a478548 100644 --- a/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp @@ -61,7 +61,7 @@ namespace KokkosBatched { template template -KOKKOS_INLINE_FUNCTION int TeamCG::template invoke( +KOKKOS_INLINE_FUNCTION int TeamCG::invoke( const MemberType& member, const OperatorType& A, const VectorViewType& _B, const VectorViewType& _X, const KrylovHandle& handle) { typedef int OrdinalType; diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp index 213c06c56a..5e4d0aba9b 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp @@ -281,7 +281,7 @@ KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, auto B_l = Kokkos::subview(G, l, first_indices); SerialTrsm::template invoke(1, A_l, B_l); + Algo::Trsm::Unblocked>::invoke(1, A_l, B_l); } if (handle.get_ortho_strategy() == 0) { diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp index b3696cf9a9..4d779f9880 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp @@ -67,7 +67,7 @@ namespace KokkosBatched { template template -KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::template invoke( +KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( const MemberType& member, const OperatorType& A, const VectorViewType& _B, const VectorViewType& _X, const PrecOperatorType& P, const KrylovHandleType& handle) { @@ -326,7 +326,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::template invoke( auto B_l = Kokkos::subview(G, l, first_indices); SerialTrsm::template invoke(1, A_l, B_l); + Algo::Trsm::Unblocked>::invoke(1, A_l, B_l); }); member.team_barrier(); // Finish writing to G @@ -375,7 +375,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::template invoke( template template -KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::template invoke( +KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( const MemberType& member, const OperatorType& A, const VectorViewType& _B, const VectorViewType& _X, const KrylovHandleType& handle) { Identity P; diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp index b09a5c7b93..cc54601d85 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp @@ -66,7 +66,7 @@ namespace KokkosBatched { template template -KOKKOS_INLINE_FUNCTION int TeamGMRES::template invoke( +KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( const MemberType& member, const OperatorType& A, const VectorViewType& _B, const VectorViewType& _X, const PrecOperatorType& P, const KrylovHandleType& handle) { @@ -323,7 +323,7 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::template invoke( auto B_l = Kokkos::subview(G, l, first_indices); SerialTrsm::template invoke(1, A_l, B_l); + Algo::Trsm::Unblocked>::invoke(1, A_l, B_l); }); member.team_barrier(); // Finish writing to G @@ -372,7 +372,7 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::template invoke( template template -KOKKOS_INLINE_FUNCTION int TeamGMRES::template invoke( +KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( const MemberType& member, const OperatorType& A, const VectorViewType& _B, const VectorViewType& _X, const KrylovHandleType& handle) { Identity P; diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 7943b1e602..fbee2fb33f 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -1141,8 +1141,7 @@ struct UpperTriSupernodalFunctor { KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Left, KokkosBatched::Uplo::Lower, KokkosBatched::Trans::Transpose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>::template invoke(team, one, - Ujj, Xjj); + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, Xjj); } team.team_barrier(); } From 22972e6d497c2d12cfa3cb6a9ea272a404ad025d Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Fri, 29 Apr 2022 01:22:41 -0600 Subject: [PATCH 117/261] Add find method to HashmapAccumulator --- src/common/KokkosKernels_HashmapAccumulator.hpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/common/KokkosKernels_HashmapAccumulator.hpp b/src/common/KokkosKernels_HashmapAccumulator.hpp index b7f39f75c2..90b35711d0 100644 --- a/src/common/KokkosKernels_HashmapAccumulator.hpp +++ b/src/common/KokkosKernels_HashmapAccumulator.hpp @@ -780,6 +780,22 @@ struct HashmapAccumulator { return __insert_success; } } + + // function to be called from device. + KOKKOS_INLINE_FUNCTION + size_type find(const key_type &key) { + size_type hash, i; + + if (key == -1) return -1; + + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + return i; + } + } + return -1; + } // end public members private: size_type __max_value_size; From 83b265990d4ffd5a41086a7c9239723d4161229d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 21 Feb 2022 10:20:08 -0700 Subject: [PATCH 118/261] Update develop version for 3.6 release --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1f8e9eb167..836b4963c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) PROJECT(KokkosKernels CXX) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) - SET(KokkosKernels_VERSION_MINOR 5) + SET(KokkosKernels_VERSION_MINOR 6) SET(KokkosKernels_VERSION_PATCH 99) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}") From bd78a12eb6bdc8eff0f6bfcb18edc4c656fa1e90 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 24 Feb 2022 15:53:36 -0700 Subject: [PATCH 119/261] Add changelog update for 3.6.00 --- CHANGELOG.md | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7abfc7b730..b0ea4553b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,129 @@ # Change Log +## [3.6.00](https://github.com/kokkos/kokkos-kernels/tree/3.6.00) (2022-02-18) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.5.00...3.6.00) + +### Features: + +#### Batched Sparse Linear algebra +- Kokkos Kernels is adding a new component to the library: batched sparse linear algebra. +- Similarly to the current dense batched algorithms, the new algorithms are called from +- the GPU and provide Team and TeamVector level of parallelism, SpMV also provides a Serial +- call on GPU. + +- Add Batched CG and Batched GMRES [\#1155](https://github.com/kokkos/kokkos-kernels/pull/1155) +- Add Jacobi Batched preconditioner [\#1219](https://github.com/kokkos/kokkos-kernels/pull/1219) + +#### Bsr and Tensor core algorithm for sparse linear algebra +- After introducing the BsrMatrix in release 3.5.0 new algorithms are now supporting this format. +- For release 3.6.0 we are adding matrix-vector (matvec) multiplication and Gauss-Seidel as well as an +- implementation of matvec that leverages tensor cores on Nvidia GPUs. More kernels are expected to +- support the Bsr format in future releases. + +- Add Spmv for BsrMatrix [\#1255](https://github.com/kokkos/kokkos-kernels/pull/1255) +- Add BLAS to SpMV operations for BsrMatrix [\#1297](https://github.com/kokkos/kokkos-kernels/pull/1297) +- BSR format support in block Gauss-Seidel [\#1232](https://github.com/kokkos/kokkos-kernels/pull/1232) +- Experimental tensor-core SpMV for BsrMatrix [\#1090](https://github.com/kokkos/kokkos-kernels/pull/1090) + +#### Improved AMD math libraries support +- rocBLAS and rocSPARSE TPLs are now officially supported, they can be enabled at configure time. +- Initial kernels that can call rocBLAS are GEMV, GEMM, IAMAX and SCAL, while rocSPARSE can be +- called for matrix-vector multiplication. Further support for TPL calls can be requested on slack +- and by GitHub issues. + +- Tpl rocBLAS and rocSPARSE [\#1153](https://github.com/kokkos/kokkos-kernels/pull/1153) +- Add rocBLAS GEMV wrapper [\#1201](https://github.com/kokkos/kokkos-kernels/pull/1201) +- Add rocBLAS wrappers for GEMM, IAMAX, and SCAL [\#1230](https://github.com/kokkos/kokkos-kernels/pull/1230) +- SpMV: adding support for rocSPARSE TPL [\#1221](https://github.com/kokkos/kokkos-kernels/pull/1221) + +#### Additional new features +- bhalf: Unit test Batched GEMM [\#1251](https://github.com/kokkos/kokkos-kernels/pull/1251) +- and demostrate GMRES example convergence with bhalf_t (https://github.com/kokkos/kokkos-kernels/pull/1300) +- Stream interface: adding stream support in GEMV and GEMM [\#1131](https://github.com/kokkos/kokkos-kernels/pull/1131) +- Improve double buffering batched gemm performance [\#1217](https://github.com/kokkos/kokkos-kernels/pull/1217) +- Allow choosing coloring algorithm in multicolor GS [\#1199](https://github.com/kokkos/kokkos-kernels/pull/1199) +- Batched: Add armpl dgemm support [\#1256](https://github.com/kokkos/kokkos-kernels/pull/1256) + +### Deprecations: +- Deprecation warning: SpaceAccessibility move out of impl, see #1140 [\#1141](https://github.com/kokkos/kokkos-kernels/pull/1141) + +### Backends and Archs Enhancements: + +#### SYCL: +- Full Blas support on SYCL [\#1270](https://github.com/kokkos/kokkos-kernels/pull/1270) +- Get sparse tests enabled and working for SYCL [\#1269](https://github.com/kokkos/kokkos-kernels/pull/1269) +- Changes to make graph run on SYCL [\#1268](https://github.com/kokkos/kokkos-kernels/pull/1268) +- Allow querying free/total memory for SYCL [\#1225](https://github.com/kokkos/kokkos-kernels/pull/1225) +- Use KOKKOS_IMPL_DO_NOT_USE_PRINTF instead of printf in kernels [\#1162](https://github.com/kokkos/kokkos-kernels/pull/1162) + +#### HIP: +- Work around hipcc size_t/int division with remainder bug [\#1262](https://github.com/kokkos/kokkos-kernels/pull/1262) + +#### Other Improvements: +- Replace std::abs with ArithTraits::abs [\#1312](https://github.com/kokkos/kokkos-kernels/pull/1312) +- Batched/dense: Add Gemm_DblBuf LayoutLeft operator [\#1299](https://github.com/kokkos/kokkos-kernels/pull/1299) +- KokkosKernels: adding variable that returns version as a single number [\#1295](https://github.com/kokkos/kokkos-kernels/pull/1295) +- Add KOKKOSKERNELS_FORCE_SIMD macro (Fix #1040) [\#1290](https://github.com/kokkos/kokkos-kernels/pull/1290) +- Rename KOKKOS_IF_{HOST,DEVICE} -> KOKKOS_IF_ON_{HOST,DEVICE} [\#1278](https://github.com/kokkos/kokkos-kernels/pull/1278) +- Algo::Level{2,3}::Blocked::mb() [\#1265](https://github.com/kokkos/kokkos-kernels/pull/1265) +- Batched: Use SerialOpt2 for 33 to 39 square matrices [\#1261](https://github.com/kokkos/kokkos-kernels/pull/1261) +- Prune extra dependencies [\#1241](https://github.com/kokkos/kokkos-kernels/pull/1241) +- Improve double buffering batched gemm perf for matrix sizes >64x64 [\#1239](https://github.com/kokkos/kokkos-kernels/pull/1239) +- Improve graph color perf test [\#1229](https://github.com/kokkos/kokkos-kernels/pull/1229) +- Add custom implementation for strcasecmp [\#1227](https://github.com/kokkos/kokkos-kernels/pull/1227) +- Replace __restrict__ with KOKKOS_RESTRICT [\#1223](https://github.com/kokkos/kokkos-kernels/pull/1223) +- Replace array reductions in BLAS-1 MV reductions [\#1204](https://github.com/kokkos/kokkos-kernels/pull/1204) +- Update MIS-2 and aggregation [\#1143](https://github.com/kokkos/kokkos-kernels/pull/1143) +- perf_test/blas/blas3: Update SHAs for benchmarking [\#1139](https://github.com/kokkos/kokkos-kernels/pull/1139) + +### Implemented enhancements BuildSystem +- Bump ROCm version 4.2 -> 4.5 in nightly Jenkins CI build [\#1279](https://github.com/kokkos/kokkos-kernels/pull/1279) +- scripts/cm_test_all_sandia: Add A64FX ci checks [\#1276](https://github.com/kokkos/kokkos-kernels/pull/1276) +- github/workflows: Add osx CI [\#1254](https://github.com/kokkos/kokkos-kernels/pull/1254) +- Update SYCL compiler version in CI [\#1247](https://github.com/kokkos/kokkos-kernels/pull/1247) +- Do not set Kokkos variables when exporting CMake configuration [\#1236](https://github.com/kokkos/kokkos-kernels/pull/1236) +- Add nightly CI check for SYCL [\#1190](https://github.com/kokkos/kokkos-kernels/pull/1190) +- Update cmake minimum version to 3.16 [\#866](https://github.com/kokkos/kokkos-kernels/pull/866) + +### Incompatibilities: +- Kokkos::Impl: removing a few more instances of throw_runtime_exception [\#1320](https://github.com/kokkos/kokkos-kernels/pull/1320) +- Remove Kokkos::Impl::throw_runtime_exception from Kokkos Kernels [\#1294](https://github.com/kokkos/kokkos-kernels/pull/1294) +- Remove unused memory space utility [\#1283](https://github.com/kokkos/kokkos-kernels/pull/1283) +- Clean up Kokkos header includes [\#1282](https://github.com/kokkos/kokkos-kernels/pull/1282) +- Remove private Kokkos header include (Cuda/Kokkos_Cuda_Half.hpp) [\#1281](https://github.com/kokkos/kokkos-kernels/pull/1281) +- Avoid using #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_* macro guards [\#1266](https://github.com/kokkos/kokkos-kernels/pull/1266) +- Rename enumerator Impl::Exec_{PTHREADS -> THREADS} [\#1253](https://github.com/kokkos/kokkos-kernels/pull/1253) +- Remove all references to the Kokkos QThreads backend [\#1238](https://github.com/kokkos/kokkos-kernels/pull/1238) +- Replace more occurences of Kokkos::Impl::is_view [\#1234](https://github.com/kokkos/kokkos-kernels/pull/1234) +- Do not use Kokkos::Impl::is_view [\#1214](https://github.com/kokkos/kokkos-kernels/pull/1214) +- Replace Kokkos::Impl::if_c -> std::conditional [\#1213](https://github.com/kokkos/kokkos-kernels/pull/1213) + +### Bug Fixes: +- Fix bug in spmv_mv_bsrmatrix() for Ampere GPU arch [\#1315](https://github.com/kokkos/kokkos-kernels/pull/1315) +- Fix std::abs calls for rocBLAS/rocSparse [\#1310](https://github.com/kokkos/kokkos-kernels/pull/1310) +- cast literal 0 to fragment scalar type [\#1307](https://github.com/kokkos/kokkos-kernels/pull/1307) +- Fix 1303: maintain correct #cols on A in twostage [\#1304](https://github.com/kokkos/kokkos-kernels/pull/1304) +- Add dimension checking to generic spmv interface [\#1301](https://github.com/kokkos/kokkos-kernels/pull/1301) +- Add missing barriers to TeamGMRES, fix vector len [\#1285](https://github.com/kokkos/kokkos-kernels/pull/1285) +- Examples: fixing some issues related to type checking [\#1267](https://github.com/kokkos/kokkos-kernels/pull/1267) +- Restrict BsrMatrix specialization for AMPERE and VOLTA to CUDA [\#1242](https://github.com/kokkos/kokkos-kernels/pull/1242) +- Fix compilation errors for multi-vectors in kk_print_1Dview() [\#1231](https://github.com/kokkos/kokkos-kernels/pull/1231) +- src/batched: Fixes #1224 [\#1226](https://github.com/kokkos/kokkos-kernels/pull/1226) +- Fix SpGEMM crashing on empty rows [\#1220](https://github.com/kokkos/kokkos-kernels/pull/1220) +- Fix issue #1212 [\#1218](https://github.com/kokkos/kokkos-kernels/pull/1218) +- example/gmres: Specify half_t namespace [\#1208](https://github.com/kokkos/kokkos-kernels/pull/1208) +- Check that ordinal types are signed [\#1188](https://github.com/kokkos/kokkos-kernels/pull/1188) +- Fixing a couple of small issue with tensor core spmv [\#1185](https://github.com/kokkos/kokkos-kernels/pull/1185) +- Fix #threads setting in pcg for OpenMP [\#1182](https://github.com/kokkos/kokkos-kernels/pull/1182) +- SpMV: fix catch all case to avoid compiler warnings [\#1179](https://github.com/kokkos/kokkos-kernels/pull/1179) +- using namespace should be scoped to prevent name clashes [\#1177](https://github.com/kokkos/kokkos-kernels/pull/1177) +- using namespace should be scoped to prevent name clashes, see issue #1170 [\#1171](https://github.com/kokkos/kokkos-kernels/pull/1171) +- Fix bug with mkl impl of spgemm [\#1167](https://github.com/kokkos/kokkos-kernels/pull/1167) +- Add missing $ to KOKKOS_HAS_TRILINOS in sparse_sptrsv_superlu check [\#1160](https://github.com/kokkos/kokkos-kernels/pull/1160) +- Small fixes to spgemm, and plug gaps in testing [\#1159](https://github.com/kokkos/kokkos-kernels/pull/1159) +- SpMV: mismatch in #ifdef check and kernel specialization [\#1151](https://github.com/kokkos/kokkos-kernels/pull/1151) +- Fix values dimension for block sparse matrices [\#1147](https://github.com/kokkos/kokkos-kernels/pull/1147) + ## [3.5.00](https://github.com/kokkos/kokkos-kernels/tree/3.5.00) (2021-10-19) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.4.01...3.5.00) From 92786cdd1827a2f048c6c3e9ca9d89ed7febb335 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 3 May 2022 08:11:56 -0600 Subject: [PATCH 120/261] BlockSpGEMM: fix variable shadowing Changing parameter name from blockDim to blkDim to avoid name clash with the CUDA defined blockDim used to launch kernels on Nvidia GPUs. Signed-off-by: Luc Berger-Vergiat --- unit_test/sparse/Test_Sparse_bspgemm.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unit_test/sparse/Test_Sparse_bspgemm.hpp b/unit_test/sparse/Test_Sparse_bspgemm.hpp index 4d4ee10157..a3ec84fedf 100644 --- a/unit_test/sparse/Test_Sparse_bspgemm.hpp +++ b/unit_test/sparse/Test_Sparse_bspgemm.hpp @@ -173,7 +173,7 @@ bool is_same_block_matrix(bsrMat_t output_mat_actual, // C := AB, where A is m*k, B is k*n, and C is m*n. template -void test_bspgemm(lno_t blockDim, lno_t m, lno_t k, lno_t n, size_type nnz, +void test_bspgemm(lno_t blkDim, lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, lno_t row_size_variance, const bool use_dynamic_scheduling = true, const size_t shared_memory_size = 0) { @@ -188,9 +188,9 @@ void test_bspgemm(lno_t blockDim, lno_t m, lno_t k, lno_t n, size_type nnz, // Generate random compressed sparse row matrix. Randomly generated (non-zero) // values are stored in a 1-D (1 rank) array. bsrMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix( - blockDim, m, k, nnz, row_size_variance, bandwidth); + blkDim, m, k, nnz, row_size_variance, bandwidth); bsrMat_t B = KokkosKernels::Impl::kk_generate_sparse_matrix( - blockDim, k, n, nnz, row_size_variance, bandwidth); + blkDim, k, n, nnz, row_size_variance, bandwidth); const bool is_empy_case = m < 1 || n < 1 || k < 1 || nnz < 1; From e04cf150037230910bc4a249d5a68378a52337de Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 3 May 2022 12:06:14 -0600 Subject: [PATCH 121/261] BlockSpGEMM fix: using variable type instead of autp It seems that the intel 17 gets confused when it has to deduce the type of a variable in bspgemm so "auto" has been replaced by the full type of the variable. Signed-off-by: Luc Berger-Vergiat --- src/common/KokkosKernels_BlockUtils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/KokkosKernels_BlockUtils.hpp b/src/common/KokkosKernels_BlockUtils.hpp index 30a46f36ec..59cb33ef7d 100644 --- a/src/common/KokkosKernels_BlockUtils.hpp +++ b/src/common/KokkosKernels_BlockUtils.hpp @@ -130,7 +130,7 @@ KOKKOS_INLINE_FUNCTION void kk_vector_block_add_mul(const size_type block_dim, for (size_type col = 0; col < block_dim; ++col) { auto v = &dst[row_offset + col]; auto vb = valB + col; - for (auto va = valA + row_offset, end = va + block_dim; va < end; ++va) { + for (const value_type *va = valA + row_offset, *end = va + block_dim; va < end; ++va) { Kokkos::atomic_add(v, (*va) * (*vb)); vb += block_dim; } From 1c5756d3fadc4eb2c47eb1ebc2e85474c2251445 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 3 May 2022 15:38:19 -0600 Subject: [PATCH 122/261] Block SpGEMM: fixing issue with lambda function Extended lambda are still not available by default in Kokkos and Kokkos Kernels so their use needs to be inside preprocessor guard. To fix the bspgemm issue, the problematic lambda is re-written as a functor. Signed-off-by: Luc Berger-Vergiat --- src/common/KokkosKernels_Sorting.hpp | 59 ++++++++++++++++++---------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp index 845a162e51..88f0ff6258 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -581,6 +581,42 @@ KOKKOS_INLINE_FUNCTION void kk_swap(T& a, T& b) { b = t; } +template +struct sort_bsr_functor{ + using lno_t = typename entries_type::non_const_value_type; + + row_map_type rowmap; + entries_type entries; + values_type values; + const lno_t blocksize; + + sort_bsr_functor(row_map_type rowmap_, entries_type entries_, values_type values_, const lno_t blocksize_) + : rowmap(rowmap_), entries(entries_), values(values_), blocksize(blocksize_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const lno_t i) const { + const lno_t rowStart = rowmap(i); + const lno_t rowSize = rowmap(i + 1) - rowStart; + auto* e = entries.data() + rowStart; + auto* v = values.data() + rowStart * blocksize; + bool done = false; + while (!done) { + done = true; + for (lno_t j = 1; j < rowSize; ++j) { + const lno_t jp = j - 1; + if (e[jp] <= e[j]) continue; + Impl::kk_swap(e[jp], e[j]); + auto const vb = v + j * blocksize; + auto const vbp = v + jp * blocksize; + for (lno_t k = 0; k < blocksize; + ++k) // std::swap_ranges(vb, vb + blocksize, vbp); + Impl::kk_swap(vb[k], vbp[k]); + done = false; + } + } + } +}; + } // namespace Impl // Sort a BRS matrix: within each row, sort entries ascending by column and @@ -598,29 +634,10 @@ void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, const lno_t blocksize = blockdim * blockdim; assert(values.extent(0) == entries.extent(0) * blocksize); + Impl::sort_bsr_functor bsr_sorter(rowmap, entries, values, blocksize); Kokkos::parallel_for( "sort_bsr_matrix", Kokkos::RangePolicy(0, numRows), - KOKKOS_LAMBDA(lno_t i) { - const lno_t rowStart = rowmap(i); - const lno_t rowSize = rowmap(i + 1) - rowStart; - auto* e = entries.data() + rowStart; - auto* v = values.data() + rowStart * blocksize; - bool done = false; - while (!done) { - done = true; - for (lno_t j = 1; j < rowSize; ++j) { - const lno_t jp = j - 1; - if (e[jp] <= e[j]) continue; - Impl::kk_swap(e[jp], e[j]); - auto const vb = v + j * blocksize; - auto const vbp = v + jp * blocksize; - for (lno_t k = 0; k < blocksize; - ++k) // std::swap_ranges(vb, vb + blocksize, vbp); - Impl::kk_swap(vb[k], vbp[k]); - done = false; - } - } - }); + bsr_sorter); } // Sort a BSR matrix (like CRS but single values are replaced with contignous From e4ef7a953f871decbcdb05790bd86ef3bf4bf95e Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 3 May 2022 15:47:16 -0600 Subject: [PATCH 123/261] Block SpGEMM: applying clang-format to modified files. Signed-off-by: Luc Berger-Vergiat --- src/common/KokkosKernels_BlockUtils.hpp | 3 +- src/common/KokkosKernels_Sorting.hpp | 41 ++++++++++++++----------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/src/common/KokkosKernels_BlockUtils.hpp b/src/common/KokkosKernels_BlockUtils.hpp index 59cb33ef7d..0c001ce115 100644 --- a/src/common/KokkosKernels_BlockUtils.hpp +++ b/src/common/KokkosKernels_BlockUtils.hpp @@ -130,7 +130,8 @@ KOKKOS_INLINE_FUNCTION void kk_vector_block_add_mul(const size_type block_dim, for (size_type col = 0; col < block_dim; ++col) { auto v = &dst[row_offset + col]; auto vb = valB + col; - for (const value_type *va = valA + row_offset, *end = va + block_dim; va < end; ++va) { + for (const value_type *va = valA + row_offset, *end = va + block_dim; + va < end; ++va) { Kokkos::atomic_add(v, (*va) * (*vb)); vb += block_dim; } diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp index 88f0ff6258..208688ae5b 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -582,16 +582,20 @@ KOKKOS_INLINE_FUNCTION void kk_swap(T& a, T& b) { } template -struct sort_bsr_functor{ +struct sort_bsr_functor { using lno_t = typename entries_type::non_const_value_type; row_map_type rowmap; entries_type entries; - values_type values; - const lno_t blocksize; + values_type values; + const lno_t blocksize; - sort_bsr_functor(row_map_type rowmap_, entries_type entries_, values_type values_, const lno_t blocksize_) - : rowmap(rowmap_), entries(entries_), values(values_), blocksize(blocksize_) {} + sort_bsr_functor(row_map_type rowmap_, entries_type entries_, + values_type values_, const lno_t blocksize_) + : rowmap(rowmap_), + entries(entries_), + values(values_), + blocksize(blocksize_) {} KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { @@ -603,15 +607,15 @@ struct sort_bsr_functor{ while (!done) { done = true; for (lno_t j = 1; j < rowSize; ++j) { - const lno_t jp = j - 1; - if (e[jp] <= e[j]) continue; - Impl::kk_swap(e[jp], e[j]); - auto const vb = v + j * blocksize; - auto const vbp = v + jp * blocksize; - for (lno_t k = 0; k < blocksize; - ++k) // std::swap_ranges(vb, vb + blocksize, vbp); - Impl::kk_swap(vb[k], vbp[k]); - done = false; + const lno_t jp = j - 1; + if (e[jp] <= e[j]) continue; + Impl::kk_swap(e[jp], e[j]); + auto const vb = v + j * blocksize; + auto const vbp = v + jp * blocksize; + for (lno_t k = 0; k < blocksize; + ++k) // std::swap_ranges(vb, vb + blocksize, vbp); + Impl::kk_swap(vb[k], vbp[k]); + done = false; } } } @@ -634,10 +638,11 @@ void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, const lno_t blocksize = blockdim * blockdim; assert(values.extent(0) == entries.extent(0) * blocksize); - Impl::sort_bsr_functor bsr_sorter(rowmap, entries, values, blocksize); - Kokkos::parallel_for( - "sort_bsr_matrix", Kokkos::RangePolicy(0, numRows), - bsr_sorter); + Impl::sort_bsr_functor bsr_sorter( + rowmap, entries, values, blocksize); + Kokkos::parallel_for("sort_bsr_matrix", + Kokkos::RangePolicy(0, numRows), + bsr_sorter); } // Sort a BSR matrix (like CRS but single values are replaced with contignous From 545dad391971b83bbc4b8879c490f99a8b64e7b4 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 3 May 2022 17:31:17 -0600 Subject: [PATCH 124/261] Clean-up src: re-organizing the src directory Mostly moving headers into subdirectories and creating a cmake directory to stash .cmake file which is really an implementation detail of our build system. Signed-off-by: Luc Berger-Vergiat --- src/CMakeLists.txt | 2 +- src/{ => common}/KokkosKernels_Half.hpp | 0 src/{ => common}/Kokkos_ArithTraits.hpp | 0 .../Kokkos_InnerProductSpaceTraits.hpp | 0 src/kokkoskernels_eti.cmake | 185 ------------------ 5 files changed, 1 insertion(+), 186 deletions(-) rename src/{ => common}/KokkosKernels_Half.hpp (100%) rename src/{ => common}/Kokkos_ArithTraits.hpp (100%) rename src/{ => common}/Kokkos_InnerProductSpaceTraits.hpp (100%) delete mode 100644 src/kokkoskernels_eti.cmake diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 27f4c97aa5..13ae5cd2b4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -52,7 +52,7 @@ IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL) APPEND_GLOB(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/impl/tpls/KokkosBlas_Host_tpl.cpp) ENDIF() -include(kokkoskernels_eti.cmake) +include(cmake/kokkoskernels_eti.cmake) SET(ETI_HEADERS) #Build up a list of DECL, AVAIL, and INST macros diff --git a/src/KokkosKernels_Half.hpp b/src/common/KokkosKernels_Half.hpp similarity index 100% rename from src/KokkosKernels_Half.hpp rename to src/common/KokkosKernels_Half.hpp diff --git a/src/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp similarity index 100% rename from src/Kokkos_ArithTraits.hpp rename to src/common/Kokkos_ArithTraits.hpp diff --git a/src/Kokkos_InnerProductSpaceTraits.hpp b/src/common/Kokkos_InnerProductSpaceTraits.hpp similarity index 100% rename from src/Kokkos_InnerProductSpaceTraits.hpp rename to src/common/Kokkos_InnerProductSpaceTraits.hpp diff --git a/src/kokkoskernels_eti.cmake b/src/kokkoskernels_eti.cmake deleted file mode 100644 index 04a6f412c9..0000000000 --- a/src/kokkoskernels_eti.cmake +++ /dev/null @@ -1,185 +0,0 @@ -# -# @FUNCTION: KOKKOSKERNELS_ETI_MAKE_LIST -# -# Create combinatorial sets of all enable ETI options. -# Consider a template T where A is an index type and B is a floating type. -# If we have two lists INDEX=INT;UINT64_T and FLOAT=FLOAT;DOUBLE, -# we can invoke the function to generate ETI for all combinations as -# KOKKOSKERNELS_ETI_MAKE_LIST(ETI_FOR_T TYPE_LISTS INDEX FLOAT) -# Upon returning from the function, the variable ETI_FOR_T -# will be a list containing four entries: -# ${ETI_FOR_T}=T_INT_FLOAT;T_INT_DOUBLE;T_UINT64_T_FLOAT;T_UINT64_T_DOUBLE; -# Additionally, each of entries in the list is itself a variable name -# containing the C++ ETI type list, e.g. -# ${T_INT_FLOAT}=int,float -# -# Usage:: -# -# KOKKOSKERNELS_ETI_MAKE_LIST( -# -# [TYPE_LISTS list1 [list2 ...]] -# ) -# ```` -# -# The name of the list output variable that will contain all generated ETI combinations -# -# ``[TYPE_LISTS list1 [[list2...]]`` -# -# The names of the lists containing ETI types. For a template T, -# then A will take every value in list1 and B will take every value in list2. -# The types listed here should be the CMake names like DOUBLE and EXECSPACE_SERIAL -FUNCTION(KOKKOSKERNELS_ETI_MAKE_LIST ETI_LIST_NAME) - CMAKE_PARSE_ARGUMENTS(ETI - "" - "" - "TYPE_LISTS" - ${ARGN} - ) - LIST(LENGTH ETI_TYPE_LISTS ETI_LIST_LENGTH) - MATH(EXPR RANGE_VARIABLE "${ETI_LIST_LENGTH} - 1") - FOREACH(IDX RANGE ${RANGE_VARIABLE}) - LIST(GET ETI_TYPE_LISTS ${IDX} LIST_NAME) - SET(LIST${IDX}_NAME ${LIST_NAME}) - ENDFOREACH() - FOREACH(TYPE0 ${${LIST0_NAME}}) - IF (KOKKOSKERNELS_INST_${TYPE0}) - SET(NAME0 ${ETI_LIST_NAME}_${TYPE0}) - SET(LIST0 ${TYPE0}) - IF (ETI_LIST_LENGTH GREATER 1) - FOREACH(TYPE1 ${${LIST1_NAME}}) - IF (KOKKOSKERNELS_INST_${TYPE1}) - SET(NAME1 ${NAME0}_${TYPE1}) - SET(LIST1 ${LIST0}) - LIST(APPEND LIST1 ${TYPE1}) - IF (ETI_LIST_LENGTH GREATER 2) - FOREACH(TYPE2 ${${LIST2_NAME}}) - IF (KOKKOSKERNELS_INST_${TYPE2}) - SET(NAME2 ${NAME1}_${TYPE2}) - SET(LIST2 ${LIST1}) - LIST(APPEND LIST2 ${TYPE2}) - IF (ETI_LIST_LENGTH GREATER 3) - FOREACH(TYPE3 ${${LIST3_NAME}}) - IF (KOKKOSKERNELS_INST_${TYPE3}) - SET(NAME3 ${NAME2}_${TYPE3}) - SET(LIST3 ${LIST2}) - LIST(APPEND LIST3 ${TYPE3}) - IF (ETI_LIST_LENGTH GREATER 4) - FOREACH(TYPE4 ${${LIST4_NAME}}) - IF (KOKKOSKERNELS_INST_${TYPE4}) - SET(NAME4 ${NAME3}_${TYPE4}) - SET(LIST4 ${LIST3}) - LIST(APPEND LIST4 ${TYPE4}) - IF (ETI_LIST_LENGTH GREATER 5) - FOREACH(TYPE4 ${${LIST4_NAME}}) - IF (KOKKOSKERNELS_INST_${TYPE5}) - SET(NAME5 ${NAME4}_${TYPE5}) - SET(LIST5 ${LIST4}) - LIST(APPEND LIST5 ${TYPE5}) - IF (ETI_LIST_LENGTH GREATER 6) - MESSAGE(FATAL_ERROR "Do not support ETI with more than 6 types") - ELSE() - #end of the eti list - LIST(APPEND ${ETI_LIST_NAME} ${NAME5}) - SET(${NAME5} ${LIST5} PARENT_SCOPE) - ENDIF() - ENDIF() - ENDFOREACH() - ELSE() - #end of the eti list - LIST(APPEND ${ETI_LIST_NAME} ${NAME4}) - SET(${NAME4} ${LIST4} PARENT_SCOPE) - ENDIF() - ENDIF() - ENDFOREACH() - ELSE() - #end of the eti list - LIST(APPEND ${ETI_LIST_NAME} ${NAME3}) - SET(${NAME3} ${LIST3} PARENT_SCOPE) - ENDIF() - ENDIF() - ENDFOREACH() - ELSE() - #end of the eti list - LIST(APPEND ${ETI_LIST_NAME} ${NAME2}) - SET(${NAME2} ${LIST2} PARENT_SCOPE) - ENDIF() - ENDIF() - ENDFOREACH() - ELSE() - #end of the eti list - LIST(APPEND ${ETI_LIST_NAME} ${NAME1}) - SET(${NAME1} ${LIST1} PARENT_SCOPE) - ENDIF() - ENDIF() - ENDFOREACH() - ELSE() - #end of the eti list - LIST(APPEND ${ETI_LIST_NAME} ${NAME0}) - SET(${NAME0} ${LIST0} PARENT_SCOPE) - ENDIF() - ENDIF() - ENDFOREACH() - SET(${ETI_LIST_NAME} ${${ETI_LIST_NAME}} PARENT_SCOPE) -ENDFUNCTION(KOKKOSKERNELS_ETI_MAKE_LIST) - -MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) - CMAKE_PARSE_ARGUMENTS(ETI - "" - "HEADER_LIST;SOURCE_LIST" - "TYPE_LISTS;COMPONENTS" - ${ARGN}) - - STRING(TOUPPER "${FUNCTION_NAME}" UPPER_NAME) - SET(ETI_DECL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_DECL") - SET(ETI_AVAIL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_AVAIL") - SET(ETI_INST_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_INST") - - # if this is tied to particular components - # see whether those components are enabled - KOKKOSKERNELS_IS_ENABLED( - COMPONENTS ${ETI_COMPONENTS} - OUTPUT_VARIABLE ETI_COMP_IS_ENABLED - ) - - IF (ETI_COMP_IS_ENABLED) - MESSAGE(STATUS "Creating ETI files for ${FUNCTION_NAME}") - KOKKOSKERNELS_ETI_MAKE_LIST(${FUNCTION_NAME}_eti TYPE_LISTS ${ETI_TYPE_LISTS}) - FOREACH(ETI ${${FUNCTION_NAME}_eti}) - SET(MACRO_STRING "(") - FOREACH(TYPE_NAME ${${ETI}}) - STRING(APPEND MACRO_STRING "${${TYPE_NAME}_CPP_TYPE},") - ENDFOREACH() - STRING(APPEND MACRO_STRING ")") - STRING(REPLACE ",)" ")" MACRO_STRING ${MACRO_STRING}) - #Make a single header file for all instances - LIST(APPEND ${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") - LIST(APPEND ${UPPER_NAME}_ETI_AVAIL_LIST "${ETI_AVAIL_MACRO}${MACRO_STRING}") - SET(${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") - #Make a different source file for each instance - SET(INST_SOURCE "impl/generated_specializations_cpp/${SUBFOLDER}/${ETI}.cpp") - SET(INST_TEMPLATE "impl/generated_specializations_cpp/${SUBFOLDER}/Kokkos${FUNCTION_NAME}_eti_spec_inst.cpp.in") - SET(${UPPER_NAME}_ETI_INST_BLOCK "${ETI_INST_MACRO}${MACRO_STRING}") - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${INST_TEMPLATE} - ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) - LIST(APPEND ${ETI_SOURCE_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) - ENDFOREACH() - ELSE() - MESSAGE(STATUS "Skipping ETI files for ${FUNCTION_NAME} because not all components are enabled") - ENDIF() - - SET(AVAIL_HEADER "impl/generated_specializations_hpp/Kokkos${FUNCTION_NAME}_eti_spec_avail.hpp") - SET(AVAIL_TEMPLATE "${AVAIL_HEADER}.in") - SET(DECL_HEADER "impl/generated_specializations_hpp/Kokkos${FUNCTION_NAME}_eti_spec_decl.hpp") - SET(DECL_TEMPLATE "${DECL_HEADER}.in") - - STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_INST_BLOCK "${${UPPER_NAME}_ETI_INST_LIST}") - STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_AVAIL_BLOCK "${${UPPER_NAME}_ETI_AVAIL_LIST}") - - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${DECL_TEMPLATE} - ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${AVAIL_TEMPLATE} - ${CMAKE_CURRENT_BINARY_DIR}/${AVAIL_HEADER}) - - LIST(APPEND ${ETI_HEADER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${AVAIL_HEADER}) - LIST(APPEND ${ETI_HEADER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) -ENDMACRO(KOKKOSKERNELS_GENERATE_ETI) From dd37035673668bdb9dfcccdac5dfc6309a74fd24 Mon Sep 17 00:00:00 2001 From: Kim Liegeois Date: Wed, 4 May 2022 07:31:53 -0600 Subject: [PATCH 125/261] Remove unneeded team_barrier --- .../sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp | 7 ++----- src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp | 7 ++----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp index 4d779f9880..7fdf244fa7 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp @@ -198,7 +198,6 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, Kokkos::make_pair(0, (int)j + 1)); - member.team_barrier(); // Inner products TeamVectorGemv::invoke(member, 1, V_old, W, 0, @@ -209,7 +208,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( TeamVectorGemv::invoke(member, -1, V_old, H_old, 1, W); - member.team_barrier(); + member.team_barrier(); // Finish writing to W } if (handle.get_ortho_strategy() == 1) { for (size_t i = 0; i < j + 1; ++i) { @@ -230,7 +229,6 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( } } - member.team_barrier(); // Finish writing to W TeamVectorDot::invoke(member, W, W, tmp); member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), @@ -336,6 +334,7 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( member, 1, Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), Kokkos::subview(G, Kokkos::ALL, first_indices), 1, X); + member.team_barrier(); // Finish writing to X } if (handle.get_ortho_strategy() == 1) { for (size_t j = 0; j < maximum_iteration; ++j) { @@ -346,8 +345,6 @@ KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( } } - member.team_barrier(); // Finish writing to X - TeamVectorCopy::invoke(member, X, _X); member.team_barrier(); diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp index cc54601d85..41ac90e61d 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp @@ -197,7 +197,6 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, Kokkos::make_pair(0, (int)j + 1)); - member.team_barrier(); // Inner products TeamGemv::invoke( member, 1, V_old, W, 0, H_old); @@ -206,7 +205,7 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( // Update TeamGemv::invoke( member, -1, V_old, H_old, 1, W); - member.team_barrier(); + member.team_barrier(); // Finish writing to W } if (handle.get_ortho_strategy() == 1) { for (size_t i = 0; i < j + 1; ++i) { @@ -227,7 +226,6 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( } } - member.team_barrier(); // Finish writing to W TeamDot::invoke(member, W, W, tmp); member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), @@ -333,6 +331,7 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( member, 1, Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), Kokkos::subview(G, Kokkos::ALL, first_indices), 1, X); + member.team_barrier(); // Finish writing to X } if (handle.get_ortho_strategy() == 1) { for (size_t j = 0; j < maximum_iteration; ++j) { @@ -343,8 +342,6 @@ KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( } } - member.team_barrier(); // Finish writing to X - TeamCopy::invoke(member, X, _X); member.team_barrier(); From 5ffd7ed3a7124e4f742dbd802002c237f491e6f9 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 4 May 2022 08:45:46 -0600 Subject: [PATCH 126/261] Cleanning-up src This will work better if the src/cmake directory is added to the commit... Signed-off-by: Luc Berger-Vergiat --- src/cmake/kokkoskernels_eti.cmake | 185 ++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 src/cmake/kokkoskernels_eti.cmake diff --git a/src/cmake/kokkoskernels_eti.cmake b/src/cmake/kokkoskernels_eti.cmake new file mode 100644 index 0000000000..04a6f412c9 --- /dev/null +++ b/src/cmake/kokkoskernels_eti.cmake @@ -0,0 +1,185 @@ +# +# @FUNCTION: KOKKOSKERNELS_ETI_MAKE_LIST +# +# Create combinatorial sets of all enable ETI options. +# Consider a template T where A is an index type and B is a floating type. +# If we have two lists INDEX=INT;UINT64_T and FLOAT=FLOAT;DOUBLE, +# we can invoke the function to generate ETI for all combinations as +# KOKKOSKERNELS_ETI_MAKE_LIST(ETI_FOR_T TYPE_LISTS INDEX FLOAT) +# Upon returning from the function, the variable ETI_FOR_T +# will be a list containing four entries: +# ${ETI_FOR_T}=T_INT_FLOAT;T_INT_DOUBLE;T_UINT64_T_FLOAT;T_UINT64_T_DOUBLE; +# Additionally, each of entries in the list is itself a variable name +# containing the C++ ETI type list, e.g. +# ${T_INT_FLOAT}=int,float +# +# Usage:: +# +# KOKKOSKERNELS_ETI_MAKE_LIST( +# +# [TYPE_LISTS list1 [list2 ...]] +# ) +# ```` +# +# The name of the list output variable that will contain all generated ETI combinations +# +# ``[TYPE_LISTS list1 [[list2...]]`` +# +# The names of the lists containing ETI types. For a template T, +# then A will take every value in list1 and B will take every value in list2. +# The types listed here should be the CMake names like DOUBLE and EXECSPACE_SERIAL +FUNCTION(KOKKOSKERNELS_ETI_MAKE_LIST ETI_LIST_NAME) + CMAKE_PARSE_ARGUMENTS(ETI + "" + "" + "TYPE_LISTS" + ${ARGN} + ) + LIST(LENGTH ETI_TYPE_LISTS ETI_LIST_LENGTH) + MATH(EXPR RANGE_VARIABLE "${ETI_LIST_LENGTH} - 1") + FOREACH(IDX RANGE ${RANGE_VARIABLE}) + LIST(GET ETI_TYPE_LISTS ${IDX} LIST_NAME) + SET(LIST${IDX}_NAME ${LIST_NAME}) + ENDFOREACH() + FOREACH(TYPE0 ${${LIST0_NAME}}) + IF (KOKKOSKERNELS_INST_${TYPE0}) + SET(NAME0 ${ETI_LIST_NAME}_${TYPE0}) + SET(LIST0 ${TYPE0}) + IF (ETI_LIST_LENGTH GREATER 1) + FOREACH(TYPE1 ${${LIST1_NAME}}) + IF (KOKKOSKERNELS_INST_${TYPE1}) + SET(NAME1 ${NAME0}_${TYPE1}) + SET(LIST1 ${LIST0}) + LIST(APPEND LIST1 ${TYPE1}) + IF (ETI_LIST_LENGTH GREATER 2) + FOREACH(TYPE2 ${${LIST2_NAME}}) + IF (KOKKOSKERNELS_INST_${TYPE2}) + SET(NAME2 ${NAME1}_${TYPE2}) + SET(LIST2 ${LIST1}) + LIST(APPEND LIST2 ${TYPE2}) + IF (ETI_LIST_LENGTH GREATER 3) + FOREACH(TYPE3 ${${LIST3_NAME}}) + IF (KOKKOSKERNELS_INST_${TYPE3}) + SET(NAME3 ${NAME2}_${TYPE3}) + SET(LIST3 ${LIST2}) + LIST(APPEND LIST3 ${TYPE3}) + IF (ETI_LIST_LENGTH GREATER 4) + FOREACH(TYPE4 ${${LIST4_NAME}}) + IF (KOKKOSKERNELS_INST_${TYPE4}) + SET(NAME4 ${NAME3}_${TYPE4}) + SET(LIST4 ${LIST3}) + LIST(APPEND LIST4 ${TYPE4}) + IF (ETI_LIST_LENGTH GREATER 5) + FOREACH(TYPE4 ${${LIST4_NAME}}) + IF (KOKKOSKERNELS_INST_${TYPE5}) + SET(NAME5 ${NAME4}_${TYPE5}) + SET(LIST5 ${LIST4}) + LIST(APPEND LIST5 ${TYPE5}) + IF (ETI_LIST_LENGTH GREATER 6) + MESSAGE(FATAL_ERROR "Do not support ETI with more than 6 types") + ELSE() + #end of the eti list + LIST(APPEND ${ETI_LIST_NAME} ${NAME5}) + SET(${NAME5} ${LIST5} PARENT_SCOPE) + ENDIF() + ENDIF() + ENDFOREACH() + ELSE() + #end of the eti list + LIST(APPEND ${ETI_LIST_NAME} ${NAME4}) + SET(${NAME4} ${LIST4} PARENT_SCOPE) + ENDIF() + ENDIF() + ENDFOREACH() + ELSE() + #end of the eti list + LIST(APPEND ${ETI_LIST_NAME} ${NAME3}) + SET(${NAME3} ${LIST3} PARENT_SCOPE) + ENDIF() + ENDIF() + ENDFOREACH() + ELSE() + #end of the eti list + LIST(APPEND ${ETI_LIST_NAME} ${NAME2}) + SET(${NAME2} ${LIST2} PARENT_SCOPE) + ENDIF() + ENDIF() + ENDFOREACH() + ELSE() + #end of the eti list + LIST(APPEND ${ETI_LIST_NAME} ${NAME1}) + SET(${NAME1} ${LIST1} PARENT_SCOPE) + ENDIF() + ENDIF() + ENDFOREACH() + ELSE() + #end of the eti list + LIST(APPEND ${ETI_LIST_NAME} ${NAME0}) + SET(${NAME0} ${LIST0} PARENT_SCOPE) + ENDIF() + ENDIF() + ENDFOREACH() + SET(${ETI_LIST_NAME} ${${ETI_LIST_NAME}} PARENT_SCOPE) +ENDFUNCTION(KOKKOSKERNELS_ETI_MAKE_LIST) + +MACRO(KOKKOSKERNELS_GENERATE_ETI FUNCTION_NAME SUBFOLDER) + CMAKE_PARSE_ARGUMENTS(ETI + "" + "HEADER_LIST;SOURCE_LIST" + "TYPE_LISTS;COMPONENTS" + ${ARGN}) + + STRING(TOUPPER "${FUNCTION_NAME}" UPPER_NAME) + SET(ETI_DECL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_DECL") + SET(ETI_AVAIL_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_AVAIL") + SET(ETI_INST_MACRO "KOKKOS${UPPER_NAME}_ETI_SPEC_INST") + + # if this is tied to particular components + # see whether those components are enabled + KOKKOSKERNELS_IS_ENABLED( + COMPONENTS ${ETI_COMPONENTS} + OUTPUT_VARIABLE ETI_COMP_IS_ENABLED + ) + + IF (ETI_COMP_IS_ENABLED) + MESSAGE(STATUS "Creating ETI files for ${FUNCTION_NAME}") + KOKKOSKERNELS_ETI_MAKE_LIST(${FUNCTION_NAME}_eti TYPE_LISTS ${ETI_TYPE_LISTS}) + FOREACH(ETI ${${FUNCTION_NAME}_eti}) + SET(MACRO_STRING "(") + FOREACH(TYPE_NAME ${${ETI}}) + STRING(APPEND MACRO_STRING "${${TYPE_NAME}_CPP_TYPE},") + ENDFOREACH() + STRING(APPEND MACRO_STRING ")") + STRING(REPLACE ",)" ")" MACRO_STRING ${MACRO_STRING}) + #Make a single header file for all instances + LIST(APPEND ${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") + LIST(APPEND ${UPPER_NAME}_ETI_AVAIL_LIST "${ETI_AVAIL_MACRO}${MACRO_STRING}") + SET(${UPPER_NAME}_ETI_DECL_LIST "${ETI_DECL_MACRO}${MACRO_STRING}") + #Make a different source file for each instance + SET(INST_SOURCE "impl/generated_specializations_cpp/${SUBFOLDER}/${ETI}.cpp") + SET(INST_TEMPLATE "impl/generated_specializations_cpp/${SUBFOLDER}/Kokkos${FUNCTION_NAME}_eti_spec_inst.cpp.in") + SET(${UPPER_NAME}_ETI_INST_BLOCK "${ETI_INST_MACRO}${MACRO_STRING}") + CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${INST_TEMPLATE} + ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) + LIST(APPEND ${ETI_SOURCE_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${INST_SOURCE}) + ENDFOREACH() + ELSE() + MESSAGE(STATUS "Skipping ETI files for ${FUNCTION_NAME} because not all components are enabled") + ENDIF() + + SET(AVAIL_HEADER "impl/generated_specializations_hpp/Kokkos${FUNCTION_NAME}_eti_spec_avail.hpp") + SET(AVAIL_TEMPLATE "${AVAIL_HEADER}.in") + SET(DECL_HEADER "impl/generated_specializations_hpp/Kokkos${FUNCTION_NAME}_eti_spec_decl.hpp") + SET(DECL_TEMPLATE "${DECL_HEADER}.in") + + STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_INST_BLOCK "${${UPPER_NAME}_ETI_INST_LIST}") + STRING(REPLACE ";" "\n" ${UPPER_NAME}_ETI_AVAIL_BLOCK "${${UPPER_NAME}_ETI_AVAIL_LIST}") + + CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${DECL_TEMPLATE} + ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) + CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/${AVAIL_TEMPLATE} + ${CMAKE_CURRENT_BINARY_DIR}/${AVAIL_HEADER}) + + LIST(APPEND ${ETI_HEADER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${AVAIL_HEADER}) + LIST(APPEND ${ETI_HEADER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/${DECL_HEADER}) +ENDMACRO(KOKKOSKERNELS_GENERATE_ETI) From 192ac7b096c4d35203d5a2a1f3a5d265226f7080 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 5 May 2022 15:16:12 -0600 Subject: [PATCH 127/261] Add ETI to SpAdd (symbolic and numeric) --- src/CMakeLists.txt | 16 + ...sSparse_spadd_numeric_eti_spec_inst.cpp.in | 53 + ...Sparse_spadd_symbolic_eti_spec_inst.cpp.in | 53 + ...Sparse_spadd_numeric_eti_spec_avail.hpp.in | 51 + ...sSparse_spadd_numeric_eti_spec_decl.hpp.in | 51 + ...parse_spadd_symbolic_eti_spec_avail.hpp.in | 51 + ...Sparse_spadd_symbolic_eti_spec_decl.hpp.in | 51 + .../KokkosSparse_spadd_tpl_spec_avail.hpp | 69 ++ .../tpls/KokkosSparse_spadd_tpl_spec_decl.hpp | 52 + src/sparse/KokkosSparse_spadd.hpp | 912 ++---------------- .../impl/KokkosSparse_spadd_numeric_impl.hpp | 306 ++++++ .../impl/KokkosSparse_spadd_numeric_spec.hpp | 244 +++++ .../impl/KokkosSparse_spadd_symbolic_impl.hpp | 635 ++++++++++++ .../impl/KokkosSparse_spadd_symbolic_spec.hpp | 189 ++++ 14 files changed, 1926 insertions(+), 807 deletions(-) create mode 100644 src/impl/generated_specializations_cpp/spadd_numeric/KokkosSparse_spadd_numeric_eti_spec_inst.cpp.in create mode 100644 src/impl/generated_specializations_cpp/spadd_symbolic/KokkosSparse_spadd_symbolic_eti_spec_inst.cpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_avail.hpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_avail.hpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in create mode 100644 src/impl/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp create mode 100644 src/impl/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp create mode 100644 src/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp create mode 100644 src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp create mode 100644 src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp create mode 100644 src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 13ae5cd2b4..ef591da4b3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -374,6 +374,22 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_jacobi spgemm_jacobi TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) +# NOTE: SpAdd symbolic doesn't use scalars directly, +# but it needs the type to use handles. +KOKKOSKERNELS_GENERATE_ETI(Sparse_spadd_symbolic spadd_symbolic + COMPONENTS sparse + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES +) + +KOKKOSKERNELS_GENERATE_ETI(Sparse_spadd_numeric spadd_numeric + COMPONENTS sparse + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES +) + KOKKOSKERNELS_GENERATE_ETI(Sparse_spiluk_symbolic spiluk_symbolic COMPONENTS sparse HEADER_LIST ETI_HEADERS diff --git a/src/impl/generated_specializations_cpp/spadd_numeric/KokkosSparse_spadd_numeric_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/spadd_numeric/KokkosSparse_spadd_numeric_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..1ffa61b1d5 --- /dev/null +++ b/src/impl/generated_specializations_cpp/spadd_numeric/KokkosSparse_spadd_numeric_eti_spec_inst.cpp.in @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" + +#include "KokkosSparse_spadd_numeric_spec.hpp" +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_NUMERIC_ETI_INST_BLOCK@ + } //IMPL +} //Kokkos diff --git a/src/impl/generated_specializations_cpp/spadd_symbolic/KokkosSparse_spadd_symbolic_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/spadd_symbolic/KokkosSparse_spadd_symbolic_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..aa08a1c6c7 --- /dev/null +++ b/src/impl/generated_specializations_cpp/spadd_symbolic/KokkosSparse_spadd_symbolic_eti_spec_inst.cpp.in @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" + +#include "KokkosSparse_spadd_symbolic_spec.hpp" +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_SYMBOLIC_ETI_INST_BLOCK@ + } //IMPL +} //Kokkos diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..b47c423974 --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_avail.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_AVAIL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_NUMERIC_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..fd971bc314 --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_NUMERIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..b38552c34a --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_avail.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_AVAIL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_SYMBOLIC_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..ea001cb72b --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_SYMBOLIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/src/impl/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp new file mode 100644 index 0000000000..9a65bc3656 --- /dev/null +++ b/src/impl/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp @@ -0,0 +1,69 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSPARSE_SPADD_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSPARSE_SPADD_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosSparse { +namespace Impl { +// Specialization struct which defines whether a specialization exists +// +template +struct spadd_symbolic_tpl_spec_avail { + enum : bool { value = false }; +}; + +template +struct spadd_numeric_tpl_spec_avail { + enum : bool { value = false }; +}; + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/src/impl/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp new file mode 100644 index 0000000000..d9f6a19911 --- /dev/null +++ b/src/impl/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp @@ -0,0 +1,52 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSPARSE_SPADD_TPL_SPEC_DECL_HPP_ +#define KOKKOSPARSE_SPADD_TPL_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Impl {} +} // namespace KokkosSparse + +#endif diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp index 6db63455be..fbc2e0c595 100644 --- a/src/sparse/KokkosSparse_spadd.hpp +++ b/src/sparse/KokkosSparse_spadd.hpp @@ -46,460 +46,13 @@ #define _KOKKOS_SPADD_HPP #include "KokkosKernels_Handle.hpp" -#include "KokkosKernels_Sorting.hpp" -#include "Kokkos_ArithTraits.hpp" +#include "KokkosKernels_helpers.hpp" +#include "KokkosSparse_spadd_symbolic_spec.hpp" +#include "KokkosSparse_spadd_numeric_spec.hpp" namespace KokkosSparse { namespace Experimental { -/* -Unsorted symbolic algorithm notes: --Only needs to sort and merge indices once, in symbolic (sorting is expensive) --Can't afford to allocate dense Views for indices/values (assume number of -columns is very large) -Want numeric() to know exactly where each A/B entry -belongs in Ccolinds/Cvalues -To accomplish all of these, symbolic() computes -arrays Apos and Bpos (both are type clno_nnz_view_t_, and have same length as -a_entries and b_entries respectively) -Apos/Bpos are saved in the handle -Apos -and Bpos each contain the final index within C row where the A/B entry belongs --See UnsortedNumericSumFunctor below for the usage of Apos/Bpos -*/ - -// Helper macro to check that two types are the same (ignoring const) -#define SAME_TYPE(A, B) \ - std::is_same::type, \ - typename std::remove_const::type>::value - -// get C rowmap for sorted input -template -struct SortedCountEntriesRange { - SortedCountEntriesRange(ordinal_type nrows_, - const typename ARowPtrsT::const_type& Arowptrs_, - const AColIndsT& Acolinds_, - const typename BRowPtrsT::const_type& Browptrs_, - const BColIndsT& Bcolinds_, - const CRowPtrsT& Crowcounts_) - : nrows(nrows_), - Arowptrs(Arowptrs_), - Acolinds(Acolinds_), - Browptrs(Browptrs_), - Bcolinds(Bcolinds_), - Crowcounts(Crowcounts_) {} - - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); - - // count the union of nonzeros in Arow and Brow - size_type numEntries = 0; - size_type ai = 0; - size_type bi = 0; - size_type Arowstart = Arowptrs(i); - size_type Arowlen = Arowptrs(i + 1) - Arowstart; - size_type Browstart = Browptrs(i); - size_type Browlen = Browptrs(i + 1) - Browstart; - ordinal_type Acol = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart); - ordinal_type Bcol = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart); - while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) { - ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol; - numEntries++; - // Eat all entries in both A and B which have this column - // This also results in Acol/Bcol being updated to following entries for - // next loop iter - while (Acol == Ccol) - Acol = (ai == Arowlen) ? ORDINAL_MAX : Acolinds(Arowstart + ai++); - while (Bcol == Ccol) - Bcol = (bi == Browlen) ? ORDINAL_MAX : Bcolinds(Browstart + bi++); - } - Crowcounts(i) = numEntries; - } - - ordinal_type nrows; - const typename ARowPtrsT::const_type Arowptrs; - const AColIndsT Acolinds; - const typename BRowPtrsT::const_type Browptrs; - const BColIndsT Bcolinds; - CRowPtrsT Crowcounts; -}; - -template -struct SortedCountEntriesTeam { - SortedCountEntriesTeam(ordinal_type nrows_, - const typename ARowPtrsT::const_type& Arowptrs_, - const AColIndsT& Acolinds_, - const typename BRowPtrsT::const_type& Browptrs_, - const BColIndsT& Bcolinds_, - const CRowPtrsT& Crowcounts_) - : nrows(nrows_), - Arowptrs(Arowptrs_), - Acolinds(Acolinds_), - Browptrs(Browptrs_), - Bcolinds(Bcolinds_), - Crowcounts(Crowcounts_) {} - - using TeamPol = Kokkos::TeamPolicy; - using TeamMem = typename TeamPol::member_type; - - KOKKOS_INLINE_FUNCTION void longRowFallback(const ordinal_type i) const { - const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); - - // count the union of nonzeros in Arow and Brow - size_type numEntries = 0; - size_type ai = 0; - size_type bi = 0; - size_type Arowstart = Arowptrs(i); - size_type Arowlen = Arowptrs(i + 1) - Arowstart; - size_type Browstart = Browptrs(i); - size_type Browlen = Browptrs(i + 1) - Browstart; - ordinal_type Acol = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart); - ordinal_type Bcol = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart); - while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) { - ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol; - numEntries++; - // Eat all entries in both A and B which have this column - // This also results in Acol/Bcol being updated to following entries for - // next loop iter - while (Acol == Ccol) - Acol = (ai == Arowlen) ? ORDINAL_MAX : Acolinds(Arowstart + ai++); - while (Bcol == Ccol) - Bcol = (bi == Browlen) ? ORDINAL_MAX : Bcolinds(Browstart + bi++); - } - Crowcounts(i) = numEntries; - } - - KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { - ordinal_type i = t.league_rank() * t.team_size() + t.team_rank(); - if (i >= nrows) return; - ordinal_type* allScratch = - (ordinal_type*)t.team_shmem().get_shmem(totalShared); - ordinal_type* scratch = allScratch + t.team_rank() * sharedPerThread; - ordinal_type Arowstart = Arowptrs(i); - ordinal_type Arowlen = Arowptrs(i + 1) - Arowstart; - ordinal_type Browstart = Browptrs(i); - ordinal_type Browlen = Browptrs(i + 1) - Browstart; - ordinal_type n = Arowlen + Browlen; - if (n > sharedPerThread) { - // fall back to slow serial method - Kokkos::single(Kokkos::PerThread(t), [&]() { longRowFallback(i); }); - return; - } - if (n == 0) { - Kokkos::single(Kokkos::PerThread(t), [&]() { Crowcounts(i) = 0; }); - return; - } - // Figure out the number of bitonic steps: ceil(log2(n)) - ordinal_type npot = 1; - ordinal_type levels = 0; - while (npot < n) { - levels++; - npot <<= 1; - } - // Copy A and B entries to scratch - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(t, Arowlen), - [&](ordinal_type j) { scratch[j] = Acolinds(Arowstart + j); }); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, Browlen), - [&](ordinal_type j) { - scratch[npot - 1 - j] = Bcolinds(Browstart + j); - }); - // Fill space between A and B with ORDINAL_MAX, - // to maintain a valid bitonic sequence of power-of-two length - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(t, npot - n), [&](ordinal_type j) { - scratch[Arowlen + j] = Kokkos::ArithTraits::max(); - }); - // npot = 2^levels - for (ordinal_type level = 0; level < levels; level++) { - // npot/2 pairs of items are compared in parallel - Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, npot >> 1), - [&](const ordinal_type j) { - ordinal_type boxSize = npot >> level; - // Which box contains this thread? - // box = (j / boxSize), and boxSize = - // 2^(levels-level), so box = j * 2^(level-levels) - // = j >> (levels - level) - ordinal_type boxID = (j * 2) >> (levels - level); - // boxStart = boxID * boxSize = boxID * - // 2^(levels-level) = boxID << (levels-level) - ordinal_type boxStart = boxID << (levels - level); - ordinal_type boxOffset = j - boxID * boxSize / 2; - ordinal_type elem1 = boxStart + boxOffset; - ordinal_type elem2 = elem1 + (boxSize >> 1); - if (scratch[elem2] < scratch[elem1]) { - ordinal_type temp = scratch[elem1]; - scratch[elem1] = scratch[elem2]; - scratch[elem2] = temp; - } - }); - } - // Finally, count the number of distinct entries (this is #rising edges + 1) - ordinal_type risingEdges; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(t, n - 1), - [&](const ordinal_type j, ordinal_type& lcount) { - if (scratch[j] != scratch[j + 1]) lcount++; - }, - risingEdges); - Kokkos::single(Kokkos::PerThread(t), - [&]() { Crowcounts(i) = risingEdges + 1; }); - } - - size_t team_shmem_size(int teamSize) const { - return sharedPerThread * sizeof(ordinal_type) * teamSize; - } - - ordinal_type nrows; - const typename ARowPtrsT::const_type Arowptrs; - const AColIndsT Acolinds; - const typename BRowPtrsT::const_type Browptrs; - const BColIndsT Bcolinds; - CRowPtrsT Crowcounts; - int sharedPerThread; // Shared for each thread, measured in - // sizeof(ordinal_type) - int totalShared; // Shared for whole team, measured in bytes -}; - -// get upper bound for C entries per row (assumes worst case, that entries in A -// and B on each row are disjoint) -template -struct UnsortedEntriesUpperBound { - UnsortedEntriesUpperBound(ordinal_type nrows_, - const typename ARowPtrsT::const_type& Arowptrs_, - const typename BRowPtrsT::const_type& Browptrs_, - const CRowPtrsT& Crowcounts_) - : nrows(nrows_), - Arowptrs(Arowptrs_), - Browptrs(Browptrs_), - Crowcounts(Crowcounts_) {} - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - Crowcounts(i) = - (Arowptrs(i + 1) - Arowptrs(i)) + (Browptrs(i + 1) - Browptrs(i)); - if (i == nrows - 1) { - // last workitem also zeros the one-past-end entry of row counts, so - // that prefix sum is correct - Crowcounts(nrows) = 0; - } - } - ordinal_type nrows; - const typename ARowPtrsT::const_type Arowptrs; - const typename BRowPtrsT::const_type Browptrs; - CRowPtrsT Crowcounts; -}; - -// Unsorted symbolic: new functors: -// -compute uncompressed C (entries only, no values) -// -sort uncompressed C entries within row, while permuting A union B -// permutation array -compress sorted C entries and A,B perm arrays at the same -// time, which produces Crowcounts value -// Inputs: A, B rowptrs/colinds, C uncompressed rowptrs (and allocated C -// entries) Output: C uncompressed colinds -template -struct UnmergedSumFunctor { - UnmergedSumFunctor(ordinal_type nrows_, const ArowptrsT& Arowptrs_, - const AcolindsT& Acolinds_, const BrowptrsT& Browptrs_, - const BcolindsT& Bcolinds_, const CrowptrsT& Crowptrs_, - const CcolindsT& Ccolinds_, const CcolindsT& ABperm_) - : nrows(nrows_), - Arowptrs(Arowptrs_), - Acolinds(Acolinds_), - Browptrs(Browptrs_), - Bcolinds(Bcolinds_), - Crowptrs(Crowptrs_), - Ccolinds(Ccolinds_), - ABperm(ABperm_) {} - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - size_type inserted = 0; - size_type crowstart = Crowptrs(i); - size_type arowstart = Arowptrs(i); - size_type arowlen = Arowptrs(i + 1) - arowstart; - size_type browstart = Browptrs(i); - size_type browlen = Browptrs(i + 1) - browstart; - // Insert all A entries, then all B entries - for (size_type j = 0; j < arowlen; j++) { - Ccolinds(crowstart + inserted) = Acolinds(arowstart + j); - ABperm(crowstart + inserted) = j; - inserted++; - } - for (size_type j = 0; j < browlen; j++) { - Ccolinds(crowstart + inserted) = Bcolinds(browstart + j); - // tell A and B permutation values apart by adding arowlen as a bias to B - // values - ABperm(crowstart + inserted) = j + arowlen; - inserted++; - } - } - ordinal_type nrows; - const ArowptrsT Arowptrs; - const AcolindsT Acolinds; - const BrowptrsT Browptrs; - const BcolindsT Bcolinds; - const CrowptrsT Crowptrs; - CcolindsT Ccolinds; - CcolindsT ABperm; -}; - -template -struct MergeEntriesFunctor { - MergeEntriesFunctor(ordinal_type nrows_, const ArowptrsT& Arowptrs_, - const BrowptrsT& Browptrs_, const CrowptrsT& Crowptrs_, - const CrowptrsT& Crowcounts_, const CcolindsT& Ccolinds_, - const CcolindsT& ABperm_, const CcolindsT& Apos_, - const CcolindsT& Bpos_) - : nrows(nrows_), - Arowptrs(Arowptrs_), - Browptrs(Browptrs_), - Crowptrs(Crowptrs_), - Crowcounts(Crowcounts_), - Ccolinds(Ccolinds_), - ABperm(ABperm_), - Apos(Apos_), - Bpos(Bpos_) {} - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - size_type CrowStart = Crowptrs(i); - size_type CrowEnd = Crowptrs(i + 1); - if (CrowEnd == CrowStart) { - Crowcounts(i) = 0; - return; - } - size_type ArowStart = Arowptrs(i); - size_type ArowNum = Arowptrs(i + 1) - ArowStart; - size_type BrowStart = Browptrs(i); - ordinal_type CFit = 0; // counting through merged C indices (within row) - for (size_type Cit = CrowStart; Cit < CrowEnd; Cit++) { - if ((Cit > CrowStart) && (Ccolinds(Cit) != Ccolinds(Cit - 1))) { - // This is a different column than the previous entry, and is not the - // first entry. This means that this is the first occurence of a unique - // column. - CFit++; - } - size_type permVal = ABperm(Cit); - if (permVal < ArowNum) { - // Entry belongs to A - ordinal_type Aindex = permVal; - // The Aindex'th entry in row i of A will be added into the CFit'th - // entry in C - Apos(ArowStart + Aindex) = CFit; - } else { - // Entry belongs to B - ordinal_type Bindex = permVal - ArowNum; - // The Bindex'th entry in row i of B will be added into the CFit'th - // entry in C - Bpos(BrowStart + Bindex) = CFit; - } - } - // At end of the row, know how many entries are in merged C. - // Right now, CFit is the index of the last Apos/Bpos, - // so adding one gives the total number of entries. - Crowcounts(i) = CFit + 1; - } - ordinal_type nrows; - const ArowptrsT Arowptrs; - const BrowptrsT Browptrs; - const CrowptrsT Crowptrs; - CrowptrsT Crowcounts; - CcolindsT Ccolinds; - const CcolindsT ABperm; - CcolindsT Apos; - CcolindsT Bpos; -}; - -// Run SortedCountEntries: non-GPU, always uses the RangePolicy version. -template -void runSortedCountEntries( - const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, - const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, - const clno_row_view_t_& c_rowmap, - typename std::enable_if()>::type* = - nullptr) { - using size_type = typename KernelHandle::size_type; - using ordinal_type = typename KernelHandle::nnz_lno_t; - using execution_space = - typename KernelHandle::SPADDHandleType::execution_space; - using range_type = Kokkos::RangePolicy; - auto nrows = c_rowmap.extent(0) - 1; - SortedCountEntriesRange - countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); - Kokkos::parallel_for( - "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", - range_type(0, nrows), countEntries); -} - -// Run SortedCountEntries: GPU, uses the TeamPolicy or RangePolicy depending -// on average nz per row (a runtime decision) -template -void runSortedCountEntries( - const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, - const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, - const clno_row_view_t_& c_rowmap, - typename std::enable_if()>::type* = - nullptr) { - using size_type = typename KernelHandle::size_type; - using ordinal_type = typename KernelHandle::nnz_lno_t; - using execution_space = - typename KernelHandle::SPADDHandleType::execution_space; - using RangePol = Kokkos::RangePolicy; - using TeamPol = Kokkos::TeamPolicy; - auto nrows = c_rowmap.extent(0) - 1; - size_type c_est_nnz = - 1.4 * (a_entries.extent(0) + b_entries.extent(0)) / nrows; - if (c_est_nnz <= 512) { - // Convert c_est_nnz to a power of 2 - size_type pot_est_nnz = 1; - while (pot_est_nnz < c_est_nnz) pot_est_nnz *= 2; - // Estimate max number of uncompressed entries in each row of C - int vector_length = 1; - int vector_length_max = - KokkosKernels::Impl::kk_get_max_vector_size(); - while (vector_length * 2 <= vector_length_max && - (size_type)vector_length * 2 <= pot_est_nnz) { - vector_length *= 2; - } - SortedCountEntriesTeam - countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); - countEntries.sharedPerThread = pot_est_nnz; - // compute largest possible team size - TeamPol testPolicy(1, 1, vector_length); - testPolicy.set_scratch_size( - 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); - int team_size = testPolicy.team_size_recommended(countEntries, - Kokkos::ParallelForTag()); - // construct real policy - int league_size = (nrows + team_size - 1) / team_size; - TeamPol policy(league_size, team_size, vector_length); - policy.set_scratch_size( - 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); - countEntries.totalShared = - countEntries.sharedPerThread * team_size * sizeof(ordinal_type); - Kokkos::parallel_for( - "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", policy, - countEntries); - } else { - SortedCountEntriesRange - countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); - Kokkos::parallel_for( - "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", - RangePol(0, nrows), countEntries); - } -} - // Symbolic: count entries in each row in C to produce rowmap // kernel handle has information about whether it is sorted add or not. template ::value, - "add_symbolic: C size_type must not be const"); - static_assert( - SAME_TYPE(typename alno_nnz_view_t_::non_const_value_type, ordinal_type), - "add_symbolic: A entry type must match KernelHandle entry type (aka " - "nnz_lno_t, and const doesn't matter)"); - static_assert( - SAME_TYPE(typename blno_nnz_view_t_::non_const_value_type, ordinal_type), - "add_symbolic: B entry type must match KernelHandle entry type (aka " - "nnz_lno_t, and const doesn't matter)"); - static_assert( - SAME_TYPE(typename clno_nnz_view_t_::non_const_value_type, ordinal_type), - "add_symbolic: C entry type must match KernelHandle entry type (aka " - "nnz_lno_t)"); - static_assert(std::is_same::value, - "add_symbolic: C entry type must not be const"); - // symbolic just needs to compute c_rowmap - // easy for sorted, but for unsorted is easiest to just compute the whole sum - auto addHandle = handle->get_spadd_handle(); - if (a_rowmap.extent(0) == 0 || a_rowmap.extent(0) == 1) { - // Have 0 rows, so nothing to do except set #nnz to 0 - addHandle->set_c_nnz(0); - // If c_rowmap has a single entry, it must be 0 - if (c_rowmap.extent(0)) Kokkos::deep_copy(c_rowmap, (size_type)0); - addHandle->set_call_symbolic(); - return; - } - ordinal_type nrows = a_rowmap.extent(0) - 1; - typedef Kokkos::RangePolicy range_type; - if (addHandle->is_input_sorted()) { - runSortedCountEntries( - a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nrows + 1, c_rowmap); - } else { - // note: scoping individual parts of the process to free views sooner, - // minimizing peak memory usage run the unsorted c_rowmap upper bound - // functor (just adds together A and B entry counts row by row) - clno_row_view_t_ c_rowmap_upperbound( - Kokkos::view_alloc(Kokkos::WithoutInitializing, - "C row counts upper bound"), - nrows + 1); - size_type c_nnz_upperbound = 0; - { - UnsortedEntriesUpperBound - countEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound); - Kokkos::parallel_for( - "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries", - range_type(0, nrows), countEntries); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nrows + 1, c_rowmap_upperbound); - Kokkos::deep_copy(c_nnz_upperbound, - Kokkos::subview(c_rowmap_upperbound, nrows)); - } - clno_nnz_view_t_ c_entries_uncompressed( - Kokkos::view_alloc(Kokkos::WithoutInitializing, - "C entries uncompressed"), - c_nnz_upperbound); - clno_nnz_view_t_ ab_perm( - Kokkos::view_alloc(Kokkos::WithoutInitializing, - "A and B permuted entry indices"), - c_nnz_upperbound); - // compute the unmerged sum - UnmergedSumFunctor - unmergedSum(nrows, a_rowmap, a_entries, b_rowmap, b_entries, - c_rowmap_upperbound, c_entries_uncompressed, ab_perm); - Kokkos::parallel_for( - "KokkosSparse::SpAdd:Symbolic::InputNotSorted::UnmergedSum", - range_type(0, nrows), unmergedSum); - // sort the unmerged sum - KokkosKernels::sort_crs_matrix( - c_rowmap_upperbound, c_entries_uncompressed, ab_perm); - clno_nnz_view_t_ a_pos( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "A entry positions"), - a_entries.extent(0)); - clno_nnz_view_t_ b_pos( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "B entry positions"), - b_entries.extent(0)); - // merge the entries and compute Apos/Bpos, as well as Crowcounts - { - MergeEntriesFunctor - mergeEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound, c_rowmap, - c_entries_uncompressed, ab_perm, a_pos, b_pos); - Kokkos::parallel_for( - "KokkosSparse::SpAdd:Symbolic::InputNotSorted::MergeEntries", - range_type(0, nrows), mergeEntries); - // compute actual c_rowmap - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nrows + 1, c_rowmap); - } - addHandle->set_a_b_pos(a_pos, b_pos); - } - // provide the number of NNZ in C to user through handle - size_type cmax; - Kokkos::deep_copy(cmax, Kokkos::subview(c_rowmap, nrows)); - addHandle->set_c_nnz(cmax); - addHandle->set_call_symbolic(); - addHandle->set_call_numeric(false); - // this fence is for accurate timing from host - execution_space().fence(); + typedef typename KernelHandle::HandleExecSpace ExecSpace; + typedef typename KernelHandle::HandleTempMemorySpace MemSpace; + typedef typename Kokkos::Device DeviceType; + + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_a_rowmap; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_a_entries; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_b_rowmap; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_b_entries; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_c_rowmap; + KokkosSparse::Impl::SPADD_SYMBOLIC:: + spadd_symbolic(handle, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0))); } -template -struct SortedNumericSumFunctor { - using CscalarT = typename CvaluesT::non_const_value_type; - - SortedNumericSumFunctor(const ArowptrsT& Arowptrs_, - const BrowptrsT& Browptrs_, - const CrowptrsT& Crowptrs_, - const AcolindsT& Acolinds_, - const BcolindsT& Bcolinds_, - const CcolindsT& Ccolinds_, const AvaluesT& Avalues_, - const BvaluesT& Bvalues_, const CvaluesT& Cvalues_, - const AscalarT alpha_, const BscalarT beta_) - : Arowptrs(Arowptrs_), - Browptrs(Browptrs_), - Crowptrs(Crowptrs_), - Acolinds(Acolinds_), - Bcolinds(Bcolinds_), - Ccolinds(Ccolinds_), - Avalues(Avalues_), - Bvalues(Bvalues_), - Cvalues(Cvalues_), - alpha(alpha_), - beta(beta_) {} - - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); - - // count the union of nonzeros in Arow and Brow - size_type ai = 0; - size_type bi = 0; - size_type Arowstart = Arowptrs(i); - size_type Arowlen = Arowptrs(i + 1) - Arowstart; - size_type Browstart = Browptrs(i); - size_type Browlen = Browptrs(i + 1) - Browstart; - ordinal_type Acol = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart); - ordinal_type Bcol = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart); - size_type Coffset = Crowptrs(i); - while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) { - ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol; - // Eat all entries in both A and B which have this column - // This also results in Acol/Bcol being updated to following entries for - // next loop iter - CscalarT accum = Kokkos::ArithTraits::zero(); - while (Acol == Ccol) { - accum += static_cast(alpha * Avalues(Arowstart + ai)); - ai++; - if (ai == Arowlen) - Acol = ORDINAL_MAX; - else - Acol = Acolinds(Arowstart + ai); - } - while (Bcol == Ccol) { - accum += static_cast(beta * Bvalues(Browstart + bi)); - bi++; - if (bi == Browlen) - Bcol = ORDINAL_MAX; - else - Bcol = Bcolinds(Browstart + bi); - } - Ccolinds(Coffset) = Ccol; - Cvalues(Coffset) = accum; - Coffset++; - } - } - - const ArowptrsT Arowptrs; - const BrowptrsT Browptrs; - const CrowptrsT Crowptrs; - const AcolindsT Acolinds; - const BcolindsT Bcolinds; - CcolindsT Ccolinds; - const AvaluesT Avalues; - const BvaluesT Bvalues; - CvaluesT Cvalues; - const AscalarT alpha; - const BscalarT beta; -}; - -template -struct UnsortedNumericSumFunctor { - using CscalarT = typename CvaluesT::non_const_value_type; - - UnsortedNumericSumFunctor( - const ArowptrsT Arowptrs_, const BrowptrsT Browptrs_, - const CrowptrsT Crowptrs_, const AcolindsT Acolinds_, - const BcolindsT Bcolinds_, CcolindsT Ccolinds_, const AvaluesT Avalues_, - const BvaluesT Bvalues_, CvaluesT Cvalues_, const AscalarT alpha_, - const BscalarT beta_, const CcolindsT Apos_, const CcolindsT Bpos_) - : Arowptrs(Arowptrs_), - Browptrs(Browptrs_), - Crowptrs(Crowptrs_), - Acolinds(Acolinds_), - Bcolinds(Bcolinds_), - Ccolinds(Ccolinds_), - Avalues(Avalues_), - Bvalues(Bvalues_), - Cvalues(Cvalues_), - alpha(alpha_), - beta(beta_), - Apos(Apos_), - Bpos(Bpos_) {} - - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - size_type CrowStart = Crowptrs(i); - size_type CrowEnd = Crowptrs(i + 1); - size_type ArowStart = Arowptrs(i); - size_type ArowEnd = Arowptrs(i + 1); - size_type BrowStart = Browptrs(i); - size_type BrowEnd = Browptrs(i + 1); - for (size_type j = CrowStart; j < CrowEnd; j++) - Cvalues(j) = Kokkos::ArithTraits::zero(); - // add in A entries, while setting C colinds - for (size_type j = ArowStart; j < ArowEnd; j++) { - Cvalues(CrowStart + Apos(j)) += alpha * Avalues(j); - Ccolinds(CrowStart + Apos(j)) = Acolinds(j); - } - // add in B entries, while setting C colinds - for (size_type j = BrowStart; j < BrowEnd; j++) { - Cvalues(CrowStart + Bpos(j)) += beta * Bvalues(j); - Ccolinds(CrowStart + Bpos(j)) = Bcolinds(j); - } - } - const ArowptrsT Arowptrs; - const BrowptrsT Browptrs; - const CrowptrsT Crowptrs; - const AcolindsT Acolinds; - const BcolindsT Bcolinds; - CcolindsT Ccolinds; - const AvaluesT Avalues; - const BvaluesT Bvalues; - CvaluesT Cvalues; - const AscalarT alpha; - const BscalarT beta; - const CcolindsT Apos; - const CcolindsT Bpos; -}; - template -void spadd_numeric(KernelHandle* kernel_handle, const alno_row_view_t_ a_rowmap, +void spadd_numeric(KernelHandle* handle, const alno_row_view_t_ a_rowmap, const alno_nnz_view_t_ a_entries, const ascalar_nnz_view_t_ a_values, const ascalar_t_ alpha, const blno_row_view_t_ b_rowmap, @@ -802,89 +120,69 @@ void spadd_numeric(KernelHandle* kernel_handle, const alno_row_view_t_ a_rowmap, const bscalar_nnz_view_t_ b_values, const bscalar_t_ beta, const clno_row_view_t_ c_rowmap, clno_nnz_view_t_ c_entries, cscalar_nnz_view_t_ c_values) { - typedef typename KernelHandle::size_type size_type; - typedef typename KernelHandle::nnz_lno_t ordinal_type; - typedef typename KernelHandle::nnz_scalar_t scalar_type; - typedef - typename KernelHandle::SPADDHandleType::execution_space execution_space; - // Check that A/B/C data types match KernelHandle types, and that C data types - // are nonconst (doesn't matter if A/B types are const) - static_assert(SAME_TYPE(ascalar_t_, scalar_type), - "A scalar type must match handle scalar type"); - static_assert(SAME_TYPE(bscalar_t_, scalar_type), - "B scalar type must match handle scalar type"); - static_assert(SAME_TYPE(typename alno_row_view_t_::value_type, size_type), - "add_symbolic: A size_type must match KernelHandle size_type " - "(const doesn't matter)"); - static_assert(SAME_TYPE(typename blno_row_view_t_::value_type, size_type), - "add_symbolic: B size_type must match KernelHandle size_type " - "(const doesn't matter)"); - static_assert( - SAME_TYPE(typename clno_row_view_t_::non_const_value_type, size_type), - "add_symbolic: C size_type must match KernelHandle size_type)"); - static_assert(SAME_TYPE(typename alno_nnz_view_t_::value_type, ordinal_type), - "add_symbolic: A entry type must match KernelHandle entry type " - "(aka nnz_lno_t, and const doesn't matter)"); - static_assert(SAME_TYPE(typename blno_nnz_view_t_::value_type, ordinal_type), - "add_symbolic: B entry type must match KernelHandle entry type " - "(aka nnz_lno_t, and const doesn't matter)"); - static_assert(SAME_TYPE(typename clno_nnz_view_t_::value_type, ordinal_type), - "add_symbolic: C entry type must match KernelHandle entry type " - "(aka nnz_lno_t)"); - static_assert(std::is_same::value, - "add_symbolic: C entry type must not be const"); - static_assert( - SAME_TYPE(typename ascalar_nnz_view_t_::value_type, scalar_type), - "add_symbolic: A scalar type must match KernelHandle entry type (aka " - "nnz_lno_t, and const doesn't matter)"); - static_assert( - SAME_TYPE(typename bscalar_nnz_view_t_::value_type, scalar_type), - "add_symbolic: B scalar type must match KernelHandle entry type (aka " - "nnz_lno_t, and const doesn't matter)"); - static_assert( - SAME_TYPE(typename cscalar_nnz_view_t_::value_type, scalar_type), - "add_symbolic: C scalar type must match KernelHandle entry type (aka " - "nnz_lno_t)"); - static_assert(std::is_same::value, - "add_symbolic: C scalar type must not be const"); - typedef Kokkos::RangePolicy range_type; - auto addHandle = kernel_handle->get_spadd_handle(); - // rowmap length can be 0 or 1 if #rows is 0. - // Otherwise, it's always #rows+1. - if (a_rowmap.extent(0) == 0 || a_rowmap.extent(0) == 1) { - addHandle->set_call_numeric(); - return; - } - ordinal_type nrows = a_rowmap.extent(0) - 1; - if (addHandle->is_input_sorted()) { - SortedNumericSumFunctor< - size_type, ordinal_type, alno_row_view_t_, blno_row_view_t_, - clno_row_view_t_, alno_nnz_view_t_, blno_nnz_view_t_, clno_nnz_view_t_, - ascalar_nnz_view_t_, bscalar_nnz_view_t_, cscalar_nnz_view_t_, - ascalar_t_, bscalar_t_> - sortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries, - c_entries, a_values, b_values, c_values, alpha, beta); - Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputSorted", - range_type(0, nrows), sortedNumeric); - } else { - // use a_pos and b_pos (set in the handle by symbolic) to quickly compute C - // entries and values - UnsortedNumericSumFunctor< - size_type, ordinal_type, alno_row_view_t_, blno_row_view_t_, - clno_row_view_t_, alno_nnz_view_t_, blno_nnz_view_t_, clno_nnz_view_t_, - ascalar_nnz_view_t_, bscalar_nnz_view_t_, cscalar_nnz_view_t_, - ascalar_t_, bscalar_t_> - unsortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries, - c_entries, a_values, b_values, c_values, alpha, beta, - addHandle->get_a_pos(), addHandle->get_b_pos()); - Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputNotSorted", - range_type(0, nrows), unsortedNumeric); - } - addHandle->set_call_numeric(); - // this fence is for accurate timing from host - execution_space().fence(); + typedef typename KernelHandle::HandleExecSpace ExecSpace; + typedef typename KernelHandle::HandleTempMemorySpace MemSpace; + typedef typename Kokkos::Device DeviceType; + + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_a_rowmap; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_a_entries; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_a_values; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_b_rowmap; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_b_entries; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_b_values; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_c_rowmap; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_c_entries; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_c_values; + KokkosSparse::Impl::SPADD_NUMERIC< + KernelHandle, Internal_a_rowmap, Internal_a_entries, Internal_a_values, + Internal_b_rowmap, Internal_b_entries, Internal_b_values, + Internal_c_rowmap, Internal_c_entries, Internal_c_values>:: + spadd_numeric( + handle, alpha, Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_a_values(a_values.data(), a_values.extent(0)), beta, + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_b_values(b_values.data(), b_values.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)), + Internal_c_entries(c_entries.data(), c_entries.extent(0)), + Internal_c_values(c_values.data(), c_values.extent(0))); } } // namespace Experimental diff --git a/src/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp new file mode 100644 index 0000000000..b3008ff716 --- /dev/null +++ b/src/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp @@ -0,0 +1,306 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOS_SPADD_NUMERIC_IMPL_HPP +#define _KOKKOS_SPADD_NUMERIC_IMPL_HPP + +#include "KokkosKernels_Handle.hpp" +#include "KokkosKernels_Sorting.hpp" +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosSparse { +namespace Impl { + +template +struct SortedNumericSumFunctor { + using CscalarT = typename CvaluesT::non_const_value_type; + + SortedNumericSumFunctor(const ArowptrsT& Arowptrs_, + const BrowptrsT& Browptrs_, + const CrowptrsT& Crowptrs_, + const AcolindsT& Acolinds_, + const BcolindsT& Bcolinds_, + const CcolindsT& Ccolinds_, const AvaluesT& Avalues_, + const BvaluesT& Bvalues_, const CvaluesT& Cvalues_, + const AscalarT alpha_, const BscalarT beta_) + : Arowptrs(Arowptrs_), + Browptrs(Browptrs_), + Crowptrs(Crowptrs_), + Acolinds(Acolinds_), + Bcolinds(Bcolinds_), + Ccolinds(Ccolinds_), + Avalues(Avalues_), + Bvalues(Bvalues_), + Cvalues(Cvalues_), + alpha(alpha_), + beta(beta_) {} + + KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + + // count the union of nonzeros in Arow and Brow + size_type ai = 0; + size_type bi = 0; + size_type Arowstart = Arowptrs(i); + size_type Arowlen = Arowptrs(i + 1) - Arowstart; + size_type Browstart = Browptrs(i); + size_type Browlen = Browptrs(i + 1) - Browstart; + ordinal_type Acol = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart); + ordinal_type Bcol = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart); + size_type Coffset = Crowptrs(i); + while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) { + ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol; + // Eat all entries in both A and B which have this column + // This also results in Acol/Bcol being updated to following entries for + // next loop iter + CscalarT accum = Kokkos::ArithTraits::zero(); + while (Acol == Ccol) { + accum += static_cast(alpha * Avalues(Arowstart + ai)); + ai++; + if (ai == Arowlen) + Acol = ORDINAL_MAX; + else + Acol = Acolinds(Arowstart + ai); + } + while (Bcol == Ccol) { + accum += static_cast(beta * Bvalues(Browstart + bi)); + bi++; + if (bi == Browlen) + Bcol = ORDINAL_MAX; + else + Bcol = Bcolinds(Browstart + bi); + } + Ccolinds(Coffset) = Ccol; + Cvalues(Coffset) = accum; + Coffset++; + } + } + + const ArowptrsT Arowptrs; + const BrowptrsT Browptrs; + const CrowptrsT Crowptrs; + const AcolindsT Acolinds; + const BcolindsT Bcolinds; + CcolindsT Ccolinds; + const AvaluesT Avalues; + const BvaluesT Bvalues; + CvaluesT Cvalues; + const AscalarT alpha; + const BscalarT beta; +}; + +template +struct UnsortedNumericSumFunctor { + using CscalarT = typename CvaluesT::non_const_value_type; + + UnsortedNumericSumFunctor( + const ArowptrsT Arowptrs_, const BrowptrsT Browptrs_, + const CrowptrsT Crowptrs_, const AcolindsT Acolinds_, + const BcolindsT Bcolinds_, CcolindsT Ccolinds_, const AvaluesT Avalues_, + const BvaluesT Bvalues_, CvaluesT Cvalues_, const AscalarT alpha_, + const BscalarT beta_, const CcolindsT Apos_, const CcolindsT Bpos_) + : Arowptrs(Arowptrs_), + Browptrs(Browptrs_), + Crowptrs(Crowptrs_), + Acolinds(Acolinds_), + Bcolinds(Bcolinds_), + Ccolinds(Ccolinds_), + Avalues(Avalues_), + Bvalues(Bvalues_), + Cvalues(Cvalues_), + alpha(alpha_), + beta(beta_), + Apos(Apos_), + Bpos(Bpos_) {} + + KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + size_type CrowStart = Crowptrs(i); + size_type CrowEnd = Crowptrs(i + 1); + size_type ArowStart = Arowptrs(i); + size_type ArowEnd = Arowptrs(i + 1); + size_type BrowStart = Browptrs(i); + size_type BrowEnd = Browptrs(i + 1); + for (size_type j = CrowStart; j < CrowEnd; j++) + Cvalues(j) = Kokkos::ArithTraits::zero(); + // add in A entries, while setting C colinds + for (size_type j = ArowStart; j < ArowEnd; j++) { + Cvalues(CrowStart + Apos(j)) += alpha * Avalues(j); + Ccolinds(CrowStart + Apos(j)) = Acolinds(j); + } + // add in B entries, while setting C colinds + for (size_type j = BrowStart; j < BrowEnd; j++) { + Cvalues(CrowStart + Bpos(j)) += beta * Bvalues(j); + Ccolinds(CrowStart + Bpos(j)) = Bcolinds(j); + } + } + const ArowptrsT Arowptrs; + const BrowptrsT Browptrs; + const CrowptrsT Crowptrs; + const AcolindsT Acolinds; + const BcolindsT Bcolinds; + CcolindsT Ccolinds; + const AvaluesT Avalues; + const BvaluesT Bvalues; + CvaluesT Cvalues; + const AscalarT alpha; + const BscalarT beta; + const CcolindsT Apos; + const CcolindsT Bpos; +}; + +// Helper macro to check that two types are the same (ignoring const) +#define SAME_TYPE(A, B) \ + std::is_same::type, \ + typename std::remove_const::type>::value + +template +void spadd_numeric_impl( + KernelHandle* kernel_handle, const alno_row_view_t a_rowmap, + const alno_nnz_view_t a_entries, const ascalar_nnz_view_t a_values, + const ascalar_t alpha, const blno_row_view_t b_rowmap, + const blno_nnz_view_t b_entries, const bscalar_nnz_view_t b_values, + const bscalar_t beta, const clno_row_view_t c_rowmap, + clno_nnz_view_t c_entries, cscalar_nnz_view_t c_values) { + typedef typename KernelHandle::size_type size_type; + typedef typename KernelHandle::nnz_lno_t ordinal_type; + typedef typename KernelHandle::nnz_scalar_t scalar_type; + typedef + typename KernelHandle::SPADDHandleType::execution_space execution_space; + // Check that A/B/C data types match KernelHandle types, and that C data types + // are nonconst (doesn't matter if A/B types are const) + static_assert(SAME_TYPE(ascalar_t, scalar_type), + "A scalar type must match handle scalar type"); + static_assert(SAME_TYPE(bscalar_t, scalar_type), + "B scalar type must match handle scalar type"); + static_assert(SAME_TYPE(typename alno_row_view_t::value_type, size_type), + "add_symbolic: A size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert(SAME_TYPE(typename blno_row_view_t::value_type, size_type), + "add_symbolic: B size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + SAME_TYPE(typename clno_row_view_t::non_const_value_type, size_type), + "add_symbolic: C size_type must match KernelHandle size_type)"); + static_assert(SAME_TYPE(typename alno_nnz_view_t::value_type, ordinal_type), + "add_symbolic: A entry type must match KernelHandle entry type " + "(aka nnz_lno_t, and const doesn't matter)"); + static_assert(SAME_TYPE(typename blno_nnz_view_t::value_type, ordinal_type), + "add_symbolic: B entry type must match KernelHandle entry type " + "(aka nnz_lno_t, and const doesn't matter)"); + static_assert(SAME_TYPE(typename clno_nnz_view_t::value_type, ordinal_type), + "add_symbolic: C entry type must match KernelHandle entry type " + "(aka nnz_lno_t)"); + static_assert(std::is_same::value, + "add_symbolic: C entry type must not be const"); + static_assert( + SAME_TYPE(typename ascalar_nnz_view_t::value_type, scalar_type), + "add_symbolic: A scalar type must match KernelHandle entry type (aka " + "nnz_lno_t, and const doesn't matter)"); + static_assert( + SAME_TYPE(typename bscalar_nnz_view_t::value_type, scalar_type), + "add_symbolic: B scalar type must match KernelHandle entry type (aka " + "nnz_lno_t, and const doesn't matter)"); + static_assert( + SAME_TYPE(typename cscalar_nnz_view_t::value_type, scalar_type), + "add_symbolic: C scalar type must match KernelHandle entry type (aka " + "nnz_lno_t)"); + static_assert(std::is_same::value, + "add_symbolic: C scalar type must not be const"); + typedef Kokkos::RangePolicy range_type; + auto addHandle = kernel_handle->get_spadd_handle(); + // rowmap length can be 0 or 1 if #rows is 0. + // Otherwise, it's always #rows+1. + if (a_rowmap.extent(0) == 0 || a_rowmap.extent(0) == 1) { + addHandle->set_call_numeric(); + return; + } + ordinal_type nrows = a_rowmap.extent(0) - 1; + if (addHandle->is_input_sorted()) { + SortedNumericSumFunctor + sortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries, + c_entries, a_values, b_values, c_values, alpha, beta); + Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputSorted", + range_type(0, nrows), sortedNumeric); + } else { + // use a_pos and b_pos (set in the handle by symbolic) to quickly compute C + // entries and values + UnsortedNumericSumFunctor + unsortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries, + c_entries, a_values, b_values, c_values, alpha, beta, + addHandle->get_a_pos(), addHandle->get_b_pos()); + Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputNotSorted", + range_type(0, nrows), unsortedNumeric); + } + addHandle->set_call_numeric(); +} + +#undef SAME_TYPE + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp new file mode 100644 index 0000000000..b1d5f6a04a --- /dev/null +++ b/src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp @@ -0,0 +1,244 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSSPARSE_IMPL_SPADD_NUMERIC_SPEC_HPP_ +#define KOKKOSSPARSE_IMPL_SPADD_NUMERIC_SPEC_HPP_ + +#include + +#include +#include "KokkosKernels_Handle.hpp" +// Include the actual functors +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include "KokkosSparse_spadd_numeric_impl.hpp" +#endif + +namespace KokkosSparse { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct spadd_numeric_eti_spec_avail { + enum : bool { value = false }; +}; + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spadd_numeric_eti_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + OFFSET_TYPE, ORDINAL_TYPE, SCALAR_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosSparse { +namespace Impl { + +// Unification layer +/// \brief Implementation of KokkosBlas::spadd (sparse-sparse matrix addition) + +template ::value, + bool eti_spec_avail = spadd_numeric_eti_spec_avail< + KernelHandle, a_size_view_t, a_lno_view_t, a_scalar_view_t, + b_size_view_t, b_lno_view_t, b_scalar_view_t, c_size_view_t, + c_lno_view_t, c_scalar_view_t>::value> +struct SPADD_NUMERIC { + static void spadd_numeric(KernelHandle *handle, + typename a_scalar_view_t::const_value_type alpha, + a_size_view_t row_mapA, a_lno_view_t entriesA, + a_scalar_view_t valuesA, + typename b_scalar_view_t::const_value_type beta, + b_size_view_t row_mapB, b_lno_view_t entriesB, + b_scalar_view_t valuesB, c_size_view_t row_mapC, + c_lno_view_t entriesC, c_scalar_view_t valuesC); +}; + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + +template +struct SPADD_NUMERIC { + static void spadd_numeric(KernelHandle *handle, + typename a_scalar_view_t::const_value_type alpha, + a_size_view_t row_mapA, a_lno_view_t entriesA, + a_scalar_view_t valuesA, + typename b_scalar_view_t::const_value_type beta, + b_size_view_t row_mapB, b_lno_view_t entriesB, + b_scalar_view_t valuesB, c_size_view_t row_mapC, + c_lno_view_t entriesC, c_scalar_view_t valuesC) { + spadd_numeric_impl(handle, row_mapA, entriesA, valuesA, alpha, row_mapB, + entriesB, valuesB, beta, row_mapC, entriesC, valuesC); + } +}; + +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPADD_NUMERIC< \ + typename KokkosKernels::Experimental::KokkosKernelsHandle< \ + OFFSET_TYPE, ORDINAL_TYPE, SCALAR_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPADD_NUMERIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + OFFSET_TYPE, ORDINAL_TYPE, SCALAR_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#include +#include + +#endif diff --git a/src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp new file mode 100644 index 0000000000..2131cec751 --- /dev/null +++ b/src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -0,0 +1,635 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOS_SPADD_SYMBOLIC_IMPL_HPP +#define _KOKKOS_SPADD_SYMBOLIC_IMPL_HPP + +#include "KokkosKernels_Handle.hpp" +#include "KokkosKernels_Sorting.hpp" +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosSparse { +namespace Impl { + +// Helper macro to check that two types are the same (ignoring const) +#define SAME_TYPE(A, B) \ + std::is_same::type, \ + typename std::remove_const::type>::value + +// get C rowmap for sorted input +template +struct SortedCountEntriesRange { + SortedCountEntriesRange(ordinal_type nrows_, + const typename ARowPtrsT::const_type& Arowptrs_, + const AColIndsT& Acolinds_, + const typename BRowPtrsT::const_type& Browptrs_, + const BColIndsT& Bcolinds_, + const CRowPtrsT& Crowcounts_) + : nrows(nrows_), + Arowptrs(Arowptrs_), + Acolinds(Acolinds_), + Browptrs(Browptrs_), + Bcolinds(Bcolinds_), + Crowcounts(Crowcounts_) {} + + KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + + // count the union of nonzeros in Arow and Brow + size_type numEntries = 0; + size_type ai = 0; + size_type bi = 0; + size_type Arowstart = Arowptrs(i); + size_type Arowlen = Arowptrs(i + 1) - Arowstart; + size_type Browstart = Browptrs(i); + size_type Browlen = Browptrs(i + 1) - Browstart; + ordinal_type Acol = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart); + ordinal_type Bcol = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart); + while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) { + ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol; + numEntries++; + // Eat all entries in both A and B which have this column + // This also results in Acol/Bcol being updated to following entries for + // next loop iter + while (Acol == Ccol) + Acol = (ai == Arowlen) ? ORDINAL_MAX : Acolinds(Arowstart + ai++); + while (Bcol == Ccol) + Bcol = (bi == Browlen) ? ORDINAL_MAX : Bcolinds(Browstart + bi++); + } + Crowcounts(i) = numEntries; + } + + ordinal_type nrows; + const typename ARowPtrsT::const_type Arowptrs; + const AColIndsT Acolinds; + const typename BRowPtrsT::const_type Browptrs; + const BColIndsT Bcolinds; + CRowPtrsT Crowcounts; +}; + +template +struct SortedCountEntriesTeam { + SortedCountEntriesTeam(ordinal_type nrows_, + const typename ARowPtrsT::const_type& Arowptrs_, + const AColIndsT& Acolinds_, + const typename BRowPtrsT::const_type& Browptrs_, + const BColIndsT& Bcolinds_, + const CRowPtrsT& Crowcounts_) + : nrows(nrows_), + Arowptrs(Arowptrs_), + Acolinds(Acolinds_), + Browptrs(Browptrs_), + Bcolinds(Bcolinds_), + Crowcounts(Crowcounts_) {} + + using TeamPol = Kokkos::TeamPolicy; + using TeamMem = typename TeamPol::member_type; + + KOKKOS_INLINE_FUNCTION void longRowFallback(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + + // count the union of nonzeros in Arow and Brow + size_type numEntries = 0; + size_type ai = 0; + size_type bi = 0; + size_type Arowstart = Arowptrs(i); + size_type Arowlen = Arowptrs(i + 1) - Arowstart; + size_type Browstart = Browptrs(i); + size_type Browlen = Browptrs(i + 1) - Browstart; + ordinal_type Acol = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart); + ordinal_type Bcol = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart); + while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) { + ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol; + numEntries++; + // Eat all entries in both A and B which have this column + // This also results in Acol/Bcol being updated to following entries for + // next loop iter + while (Acol == Ccol) + Acol = (ai == Arowlen) ? ORDINAL_MAX : Acolinds(Arowstart + ai++); + while (Bcol == Ccol) + Bcol = (bi == Browlen) ? ORDINAL_MAX : Bcolinds(Browstart + bi++); + } + Crowcounts(i) = numEntries; + } + + KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { + ordinal_type i = t.league_rank() * t.team_size() + t.team_rank(); + if (i >= nrows) return; + ordinal_type* allScratch = + (ordinal_type*)t.team_shmem().get_shmem(totalShared); + ordinal_type* scratch = allScratch + t.team_rank() * sharedPerThread; + ordinal_type Arowstart = Arowptrs(i); + ordinal_type Arowlen = Arowptrs(i + 1) - Arowstart; + ordinal_type Browstart = Browptrs(i); + ordinal_type Browlen = Browptrs(i + 1) - Browstart; + ordinal_type n = Arowlen + Browlen; + if (n > sharedPerThread) { + // fall back to slow serial method + Kokkos::single(Kokkos::PerThread(t), [&]() { longRowFallback(i); }); + return; + } + if (n == 0) { + Kokkos::single(Kokkos::PerThread(t), [&]() { Crowcounts(i) = 0; }); + return; + } + // Figure out the number of bitonic steps: ceil(log2(n)) + ordinal_type npot = 1; + ordinal_type levels = 0; + while (npot < n) { + levels++; + npot <<= 1; + } + // Copy A and B entries to scratch + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(t, Arowlen), + [&](ordinal_type j) { scratch[j] = Acolinds(Arowstart + j); }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, Browlen), + [&](ordinal_type j) { + scratch[npot - 1 - j] = Bcolinds(Browstart + j); + }); + // Fill space between A and B with ORDINAL_MAX, + // to maintain a valid bitonic sequence of power-of-two length + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(t, npot - n), [&](ordinal_type j) { + scratch[Arowlen + j] = Kokkos::ArithTraits::max(); + }); + // npot = 2^levels + for (ordinal_type level = 0; level < levels; level++) { + // npot/2 pairs of items are compared in parallel + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, npot >> 1), + [&](const ordinal_type j) { + ordinal_type boxSize = npot >> level; + // Which box contains this thread? + // box = (j / boxSize), and boxSize = + // 2^(levels-level), so box = j * 2^(level-levels) + // = j >> (levels - level) + ordinal_type boxID = (j * 2) >> (levels - level); + // boxStart = boxID * boxSize = boxID * + // 2^(levels-level) = boxID << (levels-level) + ordinal_type boxStart = boxID << (levels - level); + ordinal_type boxOffset = j - boxID * boxSize / 2; + ordinal_type elem1 = boxStart + boxOffset; + ordinal_type elem2 = elem1 + (boxSize >> 1); + if (scratch[elem2] < scratch[elem1]) { + ordinal_type temp = scratch[elem1]; + scratch[elem1] = scratch[elem2]; + scratch[elem2] = temp; + } + }); + } + // Finally, count the number of distinct entries (this is #rising edges + 1) + ordinal_type risingEdges; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(t, n - 1), + [&](const ordinal_type j, ordinal_type& lcount) { + if (scratch[j] != scratch[j + 1]) lcount++; + }, + risingEdges); + Kokkos::single(Kokkos::PerThread(t), + [&]() { Crowcounts(i) = risingEdges + 1; }); + } + + size_t team_shmem_size(int teamSize) const { + return sharedPerThread * sizeof(ordinal_type) * teamSize; + } + + ordinal_type nrows; + const typename ARowPtrsT::const_type Arowptrs; + const AColIndsT Acolinds; + const typename BRowPtrsT::const_type Browptrs; + const BColIndsT Bcolinds; + CRowPtrsT Crowcounts; + int sharedPerThread; // Shared for each thread, measured in + // sizeof(ordinal_type) + int totalShared; // Shared for whole team, measured in bytes +}; + +// get upper bound for C entries per row (assumes worst case, that entries in A +// and B on each row are disjoint) +template +struct UnsortedEntriesUpperBound { + UnsortedEntriesUpperBound(ordinal_type nrows_, + const typename ARowPtrsT::const_type& Arowptrs_, + const typename BRowPtrsT::const_type& Browptrs_, + const CRowPtrsT& Crowcounts_) + : nrows(nrows_), + Arowptrs(Arowptrs_), + Browptrs(Browptrs_), + Crowcounts(Crowcounts_) {} + KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + Crowcounts(i) = + (Arowptrs(i + 1) - Arowptrs(i)) + (Browptrs(i + 1) - Browptrs(i)); + if (i == nrows - 1) { + // last workitem also zeros the one-past-end entry of row counts, so + // that prefix sum is correct + Crowcounts(nrows) = 0; + } + } + ordinal_type nrows; + const typename ARowPtrsT::const_type Arowptrs; + const typename BRowPtrsT::const_type Browptrs; + CRowPtrsT Crowcounts; +}; + +// Unsorted symbolic: new functors: +// -compute uncompressed C (entries only, no values) +// -sort uncompressed C entries within row, while permuting A union B +// permutation array -compress sorted C entries and A,B perm arrays at the same +// time, which produces Crowcounts value +// Inputs: A, B rowptrs/colinds, C uncompressed rowptrs (and allocated C +// entries) Output: C uncompressed colinds +template +struct UnmergedSumFunctor { + UnmergedSumFunctor(ordinal_type nrows_, const ArowptrsT& Arowptrs_, + const AcolindsT& Acolinds_, const BrowptrsT& Browptrs_, + const BcolindsT& Bcolinds_, const CrowptrsT& Crowptrs_, + const CcolindsT& Ccolinds_, const CcolindsT& ABperm_) + : nrows(nrows_), + Arowptrs(Arowptrs_), + Acolinds(Acolinds_), + Browptrs(Browptrs_), + Bcolinds(Bcolinds_), + Crowptrs(Crowptrs_), + Ccolinds(Ccolinds_), + ABperm(ABperm_) {} + KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + size_type inserted = 0; + size_type crowstart = Crowptrs(i); + size_type arowstart = Arowptrs(i); + size_type arowlen = Arowptrs(i + 1) - arowstart; + size_type browstart = Browptrs(i); + size_type browlen = Browptrs(i + 1) - browstart; + // Insert all A entries, then all B entries + for (size_type j = 0; j < arowlen; j++) { + Ccolinds(crowstart + inserted) = Acolinds(arowstart + j); + ABperm(crowstart + inserted) = j; + inserted++; + } + for (size_type j = 0; j < browlen; j++) { + Ccolinds(crowstart + inserted) = Bcolinds(browstart + j); + // tell A and B permutation values apart by adding arowlen as a bias to B + // values + ABperm(crowstart + inserted) = j + arowlen; + inserted++; + } + } + ordinal_type nrows; + const ArowptrsT Arowptrs; + const AcolindsT Acolinds; + const BrowptrsT Browptrs; + const BcolindsT Bcolinds; + const CrowptrsT Crowptrs; + CcolindsT Ccolinds; + CcolindsT ABperm; +}; + +template +struct MergeEntriesFunctor { + MergeEntriesFunctor(ordinal_type nrows_, const ArowptrsT& Arowptrs_, + const BrowptrsT& Browptrs_, const OffsetView& Crowptrs_, + const CrowptrsT& Crowcounts_, const CcolindsT& Ccolinds_, + const CcolindsT& ABperm_, const CcolindsT& Apos_, + const CcolindsT& Bpos_) + : nrows(nrows_), + Arowptrs(Arowptrs_), + Browptrs(Browptrs_), + Crowptrs(Crowptrs_), + Crowcounts(Crowcounts_), + Ccolinds(Ccolinds_), + ABperm(ABperm_), + Apos(Apos_), + Bpos(Bpos_) {} + KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + size_type CrowStart = Crowptrs(i); + size_type CrowEnd = Crowptrs(i + 1); + if (CrowEnd == CrowStart) { + Crowcounts(i) = 0; + return; + } + size_type ArowStart = Arowptrs(i); + size_type ArowNum = Arowptrs(i + 1) - ArowStart; + size_type BrowStart = Browptrs(i); + ordinal_type CFit = 0; // counting through merged C indices (within row) + for (size_type Cit = CrowStart; Cit < CrowEnd; Cit++) { + if ((Cit > CrowStart) && (Ccolinds(Cit) != Ccolinds(Cit - 1))) { + // This is a different column than the previous entry, and is not the + // first entry. This means that this is the first occurence of a unique + // column. + CFit++; + } + size_type permVal = ABperm(Cit); + if (permVal < ArowNum) { + // Entry belongs to A + ordinal_type Aindex = permVal; + // The Aindex'th entry in row i of A will be added into the CFit'th + // entry in C + Apos(ArowStart + Aindex) = CFit; + } else { + // Entry belongs to B + ordinal_type Bindex = permVal - ArowNum; + // The Bindex'th entry in row i of B will be added into the CFit'th + // entry in C + Bpos(BrowStart + Bindex) = CFit; + } + } + // At end of the row, know how many entries are in merged C. + // Right now, CFit is the index of the last Apos/Bpos, + // so adding one gives the total number of entries. + Crowcounts(i) = CFit + 1; + } + ordinal_type nrows; + const ArowptrsT Arowptrs; + const BrowptrsT Browptrs; + const OffsetView Crowptrs; + CrowptrsT Crowcounts; + CcolindsT Ccolinds; + const CcolindsT ABperm; + CcolindsT Apos; + CcolindsT Bpos; +}; + +// Run SortedCountEntries: non-GPU, always uses the RangePolicy version. +template +void runSortedCountEntries( + const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, + const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, + const clno_row_view_t_& c_rowmap, + typename std::enable_if()>::type* = + nullptr) { + using size_type = typename KernelHandle::size_type; + using ordinal_type = typename KernelHandle::nnz_lno_t; + using execution_space = + typename KernelHandle::SPADDHandleType::execution_space; + using range_type = Kokkos::RangePolicy; + auto nrows = c_rowmap.extent(0) - 1; + SortedCountEntriesRange + countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + Kokkos::parallel_for( + "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", + range_type(0, nrows), countEntries); +} + +// Run SortedCountEntries: GPU, uses the TeamPolicy or RangePolicy depending +// on average nz per row (a runtime decision) +template +void runSortedCountEntries( + const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, + const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, + const clno_row_view_t_& c_rowmap, + typename std::enable_if()>::type* = + nullptr) { + using size_type = typename KernelHandle::size_type; + using ordinal_type = typename KernelHandle::nnz_lno_t; + using execution_space = + typename KernelHandle::SPADDHandleType::execution_space; + using RangePol = Kokkos::RangePolicy; + using TeamPol = Kokkos::TeamPolicy; + auto nrows = c_rowmap.extent(0) - 1; + size_type c_est_nnz = + 1.4 * (a_entries.extent(0) + b_entries.extent(0)) / nrows; + if (c_est_nnz <= 512) { + // Convert c_est_nnz to a power of 2 + size_type pot_est_nnz = 1; + while (pot_est_nnz < c_est_nnz) pot_est_nnz *= 2; + // Estimate max number of uncompressed entries in each row of C + int vector_length = 1; + int vector_length_max = + KokkosKernels::Impl::kk_get_max_vector_size(); + while (vector_length * 2 <= vector_length_max && + (size_type)vector_length * 2 <= pot_est_nnz) { + vector_length *= 2; + } + SortedCountEntriesTeam + countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + countEntries.sharedPerThread = pot_est_nnz; + // compute largest possible team size + TeamPol testPolicy(1, 1, vector_length); + testPolicy.set_scratch_size( + 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); + int team_size = testPolicy.team_size_recommended(countEntries, + Kokkos::ParallelForTag()); + // construct real policy + int league_size = (nrows + team_size - 1) / team_size; + TeamPol policy(league_size, team_size, vector_length); + policy.set_scratch_size( + 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); + countEntries.totalShared = + countEntries.sharedPerThread * team_size * sizeof(ordinal_type); + Kokkos::parallel_for( + "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", policy, + countEntries); + } else { + SortedCountEntriesRange + countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + Kokkos::parallel_for( + "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", + RangePol(0, nrows), countEntries); + } +} + +// Symbolic: count entries in each row in C to produce rowmap +// kernel handle has information about whether it is sorted add or not. +template +void spadd_symbolic_impl( + KernelHandle* handle, const alno_row_view_t_ a_rowmap, + const alno_nnz_view_t_ a_entries, const blno_row_view_t_ b_rowmap, + const blno_nnz_view_t_ b_entries, + clno_row_view_t_ c_rowmap) // c_rowmap must already be allocated (doesn't + // need to be initialized) +{ + typedef + typename KernelHandle::SPADDHandleType::execution_space execution_space; + typedef typename KernelHandle::size_type size_type; + typedef typename KernelHandle::nnz_lno_t ordinal_type; + typedef typename KernelHandle::SPADDHandleType::nnz_lno_view_t ordinal_view_t; + typedef typename KernelHandle::SPADDHandleType::nnz_row_view_t offset_view_t; + // Check that A/B/C data types match KernelHandle types, and that C data types + // are nonconst (doesn't matter if A/B types are const) + static_assert( + SAME_TYPE(typename alno_row_view_t_::non_const_value_type, size_type), + "add_symbolic: A size_type must match KernelHandle size_type (const " + "doesn't matter)"); + static_assert( + SAME_TYPE(typename blno_row_view_t_::non_const_value_type, size_type), + "add_symbolic: B size_type must match KernelHandle size_type (const " + "doesn't matter)"); + static_assert( + SAME_TYPE(typename clno_row_view_t_::non_const_value_type, size_type), + "add_symbolic: C size_type must match KernelHandle size_type)"); + static_assert(std::is_same::value, + "add_symbolic: C size_type must not be const"); + static_assert( + SAME_TYPE(typename alno_nnz_view_t_::non_const_value_type, ordinal_type), + "add_symbolic: A entry type must match KernelHandle entry type (aka " + "nnz_lno_t, and const doesn't matter)"); + static_assert( + SAME_TYPE(typename blno_nnz_view_t_::non_const_value_type, ordinal_type), + "add_symbolic: B entry type must match KernelHandle entry type (aka " + "nnz_lno_t, and const doesn't matter)"); + static_assert(std::is_same::value, + "add_symbolic: C entry type must not be const"); + // symbolic just needs to compute c_rowmap + // easy for sorted, but for unsorted is easiest to just compute the whole sum + auto addHandle = handle->get_spadd_handle(); + if (a_rowmap.extent(0) == 0 || a_rowmap.extent(0) == 1) { + // Have 0 rows, so nothing to do except set #nnz to 0 + addHandle->set_c_nnz(0); + // If c_rowmap has a single entry, it must be 0 + if (c_rowmap.extent(0)) Kokkos::deep_copy(c_rowmap, (size_type)0); + addHandle->set_call_symbolic(); + return; + } + ordinal_type nrows = a_rowmap.extent(0) - 1; + typedef Kokkos::RangePolicy range_type; + if (addHandle->is_input_sorted()) { + runSortedCountEntries( + a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + nrows + 1, c_rowmap); + } else { + // note: scoping individual parts of the process to free views sooner, + // minimizing peak memory usage run the unsorted c_rowmap upper bound + // functor (just adds together A and B entry counts row by row) + offset_view_t c_rowmap_upperbound( + Kokkos::view_alloc(Kokkos::WithoutInitializing, + "C row counts upper bound"), + nrows + 1); + size_type c_nnz_upperbound = 0; + { + UnsortedEntriesUpperBound + countEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound); + Kokkos::parallel_for( + "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries", + range_type(0, nrows), countEntries); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + nrows + 1, c_rowmap_upperbound); + Kokkos::deep_copy(c_nnz_upperbound, + Kokkos::subview(c_rowmap_upperbound, nrows)); + } + ordinal_view_t c_entries_uncompressed( + Kokkos::view_alloc(Kokkos::WithoutInitializing, + "C entries uncompressed"), + c_nnz_upperbound); + ordinal_view_t ab_perm(Kokkos::view_alloc(Kokkos::WithoutInitializing, + "A and B permuted entry indices"), + c_nnz_upperbound); + // compute the unmerged sum + UnmergedSumFunctor + unmergedSum(nrows, a_rowmap, a_entries, b_rowmap, b_entries, + c_rowmap_upperbound, c_entries_uncompressed, ab_perm); + Kokkos::parallel_for( + "KokkosSparse::SpAdd:Symbolic::InputNotSorted::UnmergedSum", + range_type(0, nrows), unmergedSum); + // sort the unmerged sum + KokkosKernels::sort_crs_matrix( + c_rowmap_upperbound, c_entries_uncompressed, ab_perm); + ordinal_view_t a_pos( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A entry positions"), + a_entries.extent(0)); + ordinal_view_t b_pos( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "B entry positions"), + b_entries.extent(0)); + // merge the entries and compute Apos/Bpos, as well as Crowcounts + { + MergeEntriesFunctor + mergeEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound, c_rowmap, + c_entries_uncompressed, ab_perm, a_pos, b_pos); + Kokkos::parallel_for( + "KokkosSparse::SpAdd:Symbolic::InputNotSorted::MergeEntries", + range_type(0, nrows), mergeEntries); + // compute actual c_rowmap + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + nrows + 1, c_rowmap); + } + addHandle->set_a_b_pos(a_pos, b_pos); + } + // provide the number of NNZ in C to user through handle + size_type cmax; + Kokkos::deep_copy(cmax, Kokkos::subview(c_rowmap, nrows)); + addHandle->set_c_nnz(cmax); + addHandle->set_call_symbolic(); + addHandle->set_call_numeric(false); +} + +#undef SAME_TYPE + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp b/src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp new file mode 100644 index 0000000000..965f4d954c --- /dev/null +++ b/src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp @@ -0,0 +1,189 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSSPARSE_IMPL_SPADD_SYMBOLIC_SPEC_HPP_ +#define KOKKOSSPARSE_IMPL_SPADD_SYMBOLIC_SPEC_HPP_ + +#include + +#include +#include "KokkosKernels_Handle.hpp" +// Include the actual functors +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include "KokkosSparse_spadd_symbolic_impl.hpp" +#endif + +namespace KokkosSparse { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct spadd_symbolic_eti_spec_avail { + enum : bool { value = false }; +}; + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spadd_symbolic_eti_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + OFFSET_TYPE, ORDINAL_TYPE, SCALAR_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosSparse { +namespace Impl { + +// Unification layer +/// \brief Implementation of KokkosBlas::spadd (sparse-sparse matrix addition) + +template ::value, + bool eti_spec_avail = spadd_symbolic_eti_spec_avail< + KernelHandle, a_size_view_t, a_lno_view_t, b_size_view_t, + b_lno_view_t, c_size_view_t>::value> +struct SPADD_SYMBOLIC { + static void spadd_symbolic(KernelHandle *handle, a_size_view_t row_mapA, + a_lno_view_t entriesA, b_size_view_t row_mapB, + b_lno_view_t entriesB, c_size_view_t row_mapC); +}; + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + +template +struct SPADD_SYMBOLIC { + static void spadd_symbolic(KernelHandle *handle, a_size_view_t row_mapA, + a_lno_view_t entriesA, b_size_view_t row_mapB, + b_lno_view_t entriesB, c_size_view_t row_mapC) { + spadd_symbolic_impl(handle, row_mapA, entriesA, row_mapB, entriesB, + row_mapC); + } +}; + +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPADD_SYMBOLIC< \ + typename KokkosKernels::Experimental::KokkosKernelsHandle< \ + OFFSET_TYPE, ORDINAL_TYPE, SCALAR_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPADD_SYMBOLIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + OFFSET_TYPE, ORDINAL_TYPE, SCALAR_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#include +#include + +#endif From 2deefeb4c03dc22111805b8ffa5b6814dd16f443 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 6 May 2022 12:17:18 -0600 Subject: [PATCH 128/261] SpAdd ETI: change Handle to use const values (to match other sparse kernels) --- src/common/KokkosKernels_Handle.hpp | 1 + src/sparse/KokkosSparse_spadd.hpp | 54 +++++++++++++------ .../impl/KokkosSparse_spadd_numeric_spec.hpp | 12 ++--- .../impl/KokkosSparse_spadd_symbolic_spec.hpp | 12 ++--- 4 files changed, 51 insertions(+), 28 deletions(-) diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp index 0e9ba8dc4e..69a74c3e5d 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/common/KokkosKernels_Handle.hpp @@ -181,6 +181,7 @@ class KokkosKernelsHandle { this->gs_sptrsvUHandle = right_side_handle.get_gs_sptrsvU_handle(); this->spgemmHandle = right_side_handle.get_spgemm_handle(); + this->spaddHandle = right_side_handle.get_spadd_handle(); this->sptrsvHandle = right_side_handle.get_sptrsv_handle(); this->spilukHandle = right_side_handle.get_spiluk_handle(); diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp index fbc2e0c595..38bead14de 100644 --- a/src/sparse/KokkosSparse_spadd.hpp +++ b/src/sparse/KokkosSparse_spadd.hpp @@ -68,8 +68,18 @@ void spadd_symbolic( { typedef typename KernelHandle::HandleExecSpace ExecSpace; typedef typename KernelHandle::HandleTempMemorySpace MemSpace; + typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; typedef typename Kokkos::Device DeviceType; + typedef typename KernelHandle::const_size_type c_size_t; + typedef typename KernelHandle::const_nnz_lno_t c_lno_t; + typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t; + + typedef typename KokkosKernels::Experimental::KokkosKernelsHandle< + c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace> + ConstKernelHandle; + ConstKernelHandle tmp_handle(*handle); + typedef Kokkos::View::array_layout, @@ -95,10 +105,10 @@ void spadd_symbolic( clno_row_view_t_>::array_layout, DeviceType, Kokkos::MemoryTraits > Internal_c_rowmap; - KokkosSparse::Impl::SPADD_SYMBOLIC:: - spadd_symbolic(handle, + spadd_symbolic(&tmp_handle, Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), Internal_a_entries(a_entries.data(), a_entries.extent(0)), Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), @@ -122,8 +132,18 @@ void spadd_numeric(KernelHandle* handle, const alno_row_view_t_ a_rowmap, cscalar_nnz_view_t_ c_values) { typedef typename KernelHandle::HandleExecSpace ExecSpace; typedef typename KernelHandle::HandleTempMemorySpace MemSpace; + typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; typedef typename Kokkos::Device DeviceType; + typedef typename KernelHandle::const_size_type c_size_t; + typedef typename KernelHandle::const_nnz_lno_t c_lno_t; + typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t; + + typedef typename KokkosKernels::Experimental::KokkosKernelsHandle< + c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace> + ConstKernelHandle; + ConstKernelHandle tmp_handle(*handle); + typedef Kokkos::View::array_layout, @@ -169,20 +189,22 @@ void spadd_numeric(KernelHandle* handle, const alno_row_view_t_ a_rowmap, cscalar_nnz_view_t_>::array_layout, DeviceType, Kokkos::MemoryTraits > Internal_c_values; - KokkosSparse::Impl::SPADD_NUMERIC< - KernelHandle, Internal_a_rowmap, Internal_a_entries, Internal_a_values, - Internal_b_rowmap, Internal_b_entries, Internal_b_values, - Internal_c_rowmap, Internal_c_entries, Internal_c_values>:: - spadd_numeric( - handle, alpha, Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), - Internal_a_entries(a_entries.data(), a_entries.extent(0)), - Internal_a_values(a_values.data(), a_values.extent(0)), beta, - Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), - Internal_b_entries(b_entries.data(), b_entries.extent(0)), - Internal_b_values(b_values.data(), b_values.extent(0)), - Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)), - Internal_c_entries(c_entries.data(), c_entries.extent(0)), - Internal_c_values(c_values.data(), c_values.extent(0))); + KokkosSparse::Impl::SPADD_NUMERIC:: + spadd_numeric(&tmp_handle, alpha, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_a_values(a_values.data(), a_values.extent(0)), + beta, + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_b_values(b_values.data(), b_values.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)), + Internal_c_entries(c_entries.data(), c_entries.extent(0)), + Internal_c_values(c_values.data(), c_values.extent(0))); } } // namespace Experimental diff --git a/src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp index b1d5f6a04a..7cc93e2715 100644 --- a/src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp @@ -73,8 +73,8 @@ struct spadd_numeric_eti_spec_avail { template <> \ struct spadd_numeric_eti_spec_avail< \ KokkosKernels::Experimental::KokkosKernelsHandle< \ - OFFSET_TYPE, ORDINAL_TYPE, SCALAR_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -171,8 +171,8 @@ struct SPADD_NUMERIC, \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -207,8 +207,8 @@ struct SPADD_NUMERIC, \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp b/src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp index 965f4d954c..7a48999e6a 100644 --- a/src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp @@ -71,8 +71,8 @@ struct spadd_symbolic_eti_spec_avail { template <> \ struct spadd_symbolic_eti_spec_avail< \ KokkosKernels::Experimental::KokkosKernelsHandle< \ - OFFSET_TYPE, ORDINAL_TYPE, SCALAR_TYPE, EXEC_SPACE_TYPE, \ - MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -140,8 +140,8 @@ struct SPADD_SYMBOLIC, \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -164,8 +164,8 @@ struct SPADD_SYMBOLIC, \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ From dcb6953710c09b705effd1c0f5d3d778317b3dd0 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 6 May 2022 12:05:26 -0600 Subject: [PATCH 129/261] Add ETI for D1 coloring --- src/CMakeLists.txt | 7 + src/graph/KokkosGraph_Distance1Color.hpp | 108 ++++--------- .../impl/KokkosGraph_Distance1Color_impl.hpp | 82 ++++++++++ src/graph/impl/KokkosGraph_color_d1_spec.hpp | 153 ++++++++++++++++++ .../KokkosGraph_color_d1_eti_spec_inst.cpp.in | 53 ++++++ ...KokkosGraph_color_d1_eti_spec_avail.hpp.in | 51 ++++++ .../KokkosGraph_color_d1_eti_spec_decl.hpp.in | 51 ++++++ 7 files changed, 428 insertions(+), 77 deletions(-) create mode 100644 src/graph/impl/KokkosGraph_color_d1_spec.hpp create mode 100644 src/impl/generated_specializations_cpp/color_d1/KokkosGraph_color_d1_eti_spec_inst.cpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ef591da4b3..8fd0bc21b8 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -439,6 +439,13 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_gauss_seidel_apply gauss_seidel_apply TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) +KOKKOSKERNELS_GENERATE_ETI(Graph_color_d1 color_d1 + COMPONENTS graph + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES +) + LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h) #Add a few other utility files diff --git a/src/graph/KokkosGraph_Distance1Color.hpp b/src/graph/KokkosGraph_Distance1Color.hpp index 3001ea660c..aca6414c83 100644 --- a/src/graph/KokkosGraph_Distance1Color.hpp +++ b/src/graph/KokkosGraph_Distance1Color.hpp @@ -44,8 +44,8 @@ #ifndef _KOKKOSGRAPH_DISTANCE1_COLOR_HPP #define _KOKKOSGRAPH_DISTANCE1_COLOR_HPP -#include "KokkosGraph_Distance1ColorHandle.hpp" -#include "KokkosGraph_Distance1Color_impl.hpp" +#include "KokkosGraph_color_d1_spec.hpp" +#include "KokkosKernels_helpers.hpp" #include "KokkosKernels_Utils.hpp" namespace KokkosGraph { @@ -59,81 +59,35 @@ void graph_color_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t /* num_cols */, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool /* is_symmetric */ = true) { - Kokkos::Timer timer; - - typename KernelHandle::GraphColoringHandleType *gch = - handle->get_graph_coloring_handle(); - - ColoringAlgorithm algorithm = gch->get_coloring_algo_type(); - - typedef typename KernelHandle::GraphColoringHandleType::color_view_t - color_view_type; - - gch->set_tictoc(handle->get_verbose()); - - color_view_type colors_out; - if (gch->get_vertex_colors().use_count() > 0) { - colors_out = gch->get_vertex_colors(); - } else { - colors_out = color_view_type("Graph Colors", num_rows); - } - - typedef - typename Impl::GraphColor - BaseGraphColoring; - BaseGraphColoring *gc = NULL; - - switch (algorithm) { - case COLORING_SERIAL: - gc = new BaseGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); - break; - - case COLORING_VB: - case COLORING_VBBIT: - case COLORING_VBCS: - typedef typename Impl::GraphColor_VB< - typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, - lno_nnz_view_t_> - VBGraphColoring; - gc = new VBGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); - break; - - case COLORING_VBD: - case COLORING_VBDBIT: - typedef typename Impl::GraphColor_VBD< - typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, - lno_nnz_view_t_> - VBDGraphColoring; - gc = new VBDGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); - break; - - case COLORING_EB: - typedef typename Impl::GraphColor_EB< - typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, - lno_nnz_view_t_> - EBGraphColoring; - gc = new EBGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); - break; - - case COLORING_DEFAULT: break; - - default: break; - } - - int num_phases = 0; - gc->color_graph(colors_out, num_phases); - - delete gc; - double coloring_time = timer.seconds(); - gch->add_to_overall_coloring_time(coloring_time); - gch->set_coloring_time(coloring_time); - gch->set_num_phases(num_phases); - gch->set_vertex_colors(colors_out); + typedef typename KernelHandle::HandleExecSpace ExecSpace; + typedef typename KernelHandle::HandleTempMemorySpace MemSpace; + typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; + typedef typename Kokkos::Device DeviceType; + + typedef typename KernelHandle::const_size_type c_size_t; + typedef typename KernelHandle::const_nnz_lno_t c_lno_t; + typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t; + + typedef typename KokkosKernels::Experimental::KokkosKernelsHandle< + c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace> + ConstKernelHandle; + ConstKernelHandle tmp_handle(*handle); + + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_rowmap; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_entries; + KokkosGraph::Impl:: + COLOR_D1::color_d1( + &tmp_handle, num_rows, + Internal_rowmap(row_map.data(), row_map.extent(0)), + Internal_entries(entries.data(), entries.extent(0))); } template +void graph_color_impl(KernelHandle *handle, + typename KernelHandle::nnz_lno_t num_rows, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries) { + Kokkos::Timer timer; + + typename KernelHandle::GraphColoringHandleType *gch = + handle->get_graph_coloring_handle(); + + ColoringAlgorithm algorithm = gch->get_coloring_algo_type(); + + typedef typename KernelHandle::GraphColoringHandleType::color_view_t + color_view_type; + + gch->set_tictoc(handle->get_verbose()); + + color_view_type colors_out; + if (gch->get_vertex_colors().use_count() > 0) { + colors_out = gch->get_vertex_colors(); + } else { + colors_out = color_view_type("Graph Colors", num_rows); + } + + typedef + typename Impl::GraphColor + BaseGraphColoring; + BaseGraphColoring *gc = NULL; + + switch (algorithm) { + case COLORING_SERIAL: + gc = new BaseGraphColoring(num_rows, entries.extent(0), row_map, entries, + gch); + break; + + case COLORING_VB: + case COLORING_VBBIT: + case COLORING_VBCS: + typedef typename Impl::GraphColor_VB< + typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, + lno_nnz_view_t_> + VBGraphColoring; + gc = new VBGraphColoring(num_rows, entries.extent(0), row_map, entries, + gch); + break; + + case COLORING_VBD: + case COLORING_VBDBIT: + typedef typename Impl::GraphColor_VBD< + typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, + lno_nnz_view_t_> + VBDGraphColoring; + gc = new VBDGraphColoring(num_rows, entries.extent(0), row_map, entries, + gch); + break; + + case COLORING_EB: + typedef typename Impl::GraphColor_EB< + typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, + lno_nnz_view_t_> + EBGraphColoring; + gc = new EBGraphColoring(num_rows, entries.extent(0), row_map, entries, + gch); + break; + + case COLORING_DEFAULT: break; + + default: break; + } + + int num_phases = 0; + gc->color_graph(colors_out, num_phases); + + delete gc; + double coloring_time = timer.seconds(); + gch->add_to_overall_coloring_time(coloring_time); + gch->set_coloring_time(coloring_time); + gch->set_num_phases(num_phases); + gch->set_vertex_colors(colors_out); +} + } // namespace Impl } // namespace KokkosGraph diff --git a/src/graph/impl/KokkosGraph_color_d1_spec.hpp b/src/graph/impl/KokkosGraph_color_d1_spec.hpp new file mode 100644 index 0000000000..67cd09a099 --- /dev/null +++ b/src/graph/impl/KokkosGraph_color_d1_spec.hpp @@ -0,0 +1,153 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSSPARSE_IMPL_COLOR_D1_SPEC_HPP_ +#define KOKKOSSPARSE_IMPL_COLOR_D1_SPEC_HPP_ + +#include + +#include +#include "KokkosKernels_Handle.hpp" +// Include the actual functors +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include "KokkosGraph_Distance1Color_impl.hpp" +#endif + +namespace KokkosGraph { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct color_d1_eti_spec_avail { + enum : bool { value = false }; +}; + +} // namespace Impl +} // namespace KokkosGraph + +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template <> \ + struct color_d1_eti_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include + +namespace KokkosGraph { +namespace Impl { + +// Unification layer +/// \brief Implementation of KokkosGraph::graph_color (distance-1 greedy +/// coloring) + +template ::value> +struct COLOR_D1 { + static void color_d1(KernelHandle *handle, + typename lno_view_t::non_const_value_type num_rows, + size_view_t rowmap, lno_view_t entries); +}; + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + +template +struct COLOR_D1 { + static void color_d1(KernelHandle *handle, + typename lno_view_t::non_const_value_type num_rows, + size_view_t rowmap, lno_view_t entries) { + KokkosGraph::Impl::graph_color_impl(handle, num_rows, rowmap, entries); + } +}; + +#endif + +} // namespace Impl +} // namespace KokkosGraph + +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + extern template struct COLOR_D1< \ + typename KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; + +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + template struct COLOR_D1< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; + +#include + +#endif diff --git a/src/impl/generated_specializations_cpp/color_d1/KokkosGraph_color_d1_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/color_d1/KokkosGraph_color_d1_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..c4e4c8efe6 --- /dev/null +++ b/src/impl/generated_specializations_cpp/color_d1/KokkosGraph_color_d1_eti_spec_inst.cpp.in @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" + +#include "KokkosGraph_color_d1_spec.hpp" +namespace KokkosGraph { +namespace Impl { +@GRAPH_COLOR_D1_ETI_INST_BLOCK@ + } //IMPL +} //Kokkos diff --git a/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..7b9b69063c --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_AVAIL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosGraph { +namespace Impl { +@GRAPH_COLOR_D1_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..fc47564161 --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosGraph { +namespace Impl { +@GRAPH_COLOR_D1_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif From 3f64eda59b9236f3b1e7e5aa424408838c9462a4 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 10 May 2022 15:03:36 -0600 Subject: [PATCH 130/261] Initial hash map spiluk numeric impl --- src/sparse/KokkosSparse_spiluk_handle.hpp | 40 +- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 452 +++++++++++++----- .../KokkosSparse_spiluk_symbolic_impl.hpp | 156 +++++- 3 files changed, 539 insertions(+), 109 deletions(-) diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp index 3cabcd0f73..f7112c61dc 100644 --- a/src/sparse/KokkosSparse_spiluk_handle.hpp +++ b/src/sparse/KokkosSparse_spiluk_handle.hpp @@ -58,7 +58,8 @@ namespace Experimental { // TP2 algorithm has issues with some offset-ordinal combo to be addressed enum class SPILUKAlgorithm { SEQLVLSCHD_RP, - SEQLVLSCHD_TP1 /*, SEQLVLSCHED_TP2*/ + SEQLVLSCHD_TP1, /*, SEQLVLSCHED_TP2*/ + SEQLVLSCHD_TP1HASHMAP }; template nnz_lno_view_t; + typedef typename Kokkos::View + nnz_row_view_host_t; + typedef typename std::make_signed< typename nnz_row_view_t::non_const_value_type>::type signed_integral_t; typedef Kokkos::View +struct ILUKLvlSchedTP1HashMapNumericFunctor +{ + using execution_space = typename ARowMapType::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using size_type = typename ARowMapType::non_const_value_type; + using scalar_t = typename AValuesType::non_const_value_type ; + using hashmap_type = KokkosKernels::Experimental::HashmapAccumulator + ; + + ARowMapType A_row_map; + AEntriesType A_entries; + AValuesType A_values; + LRowMapType L_row_map; + LEntriesType L_entries; + LValuesType L_values; + URowMapType U_row_map; + UEntriesType U_entries; + UValuesType U_values; + LevelViewType level_idx; + nnz_lno_t lev_start; + nnz_lno_t shmem_hash_size; + nnz_lno_t shmem_key_size; + nnz_lno_t shared_memory_hash_func; + nnz_lno_t shmem_size; + + ILUKLvlSchedTP1HashMapNumericFunctor( const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, + const LRowMapType &L_row_map_, const LEntriesType &L_entries_, LValuesType &L_values_, + const URowMapType &U_row_map_, const UEntriesType &U_entries_, UValuesType &U_values_, + const LevelViewType &level_idx_, const nnz_lno_t &lev_start_, const nnz_lno_t &shmem_hash_size_, + const nnz_lno_t &shmem_key_size_, const nnz_lno_t &shared_memory_hash_func_, const nnz_lno_t &shmem_size_) : + A_row_map(A_row_map_), A_entries(A_entries_), A_values(A_values_), + L_row_map(L_row_map_), L_entries(L_entries_), L_values(L_values_), + U_row_map(U_row_map_), U_entries(U_entries_), U_values(U_values_), + level_idx(level_idx_), lev_start(lev_start_), shmem_hash_size(shmem_hash_size_), + shmem_key_size(shmem_key_size_), shared_memory_hash_func(shared_memory_hash_func_), + shmem_size(shmem_size_) {} + + KOKKOS_INLINE_FUNCTION + void operator()( const member_type & team ) const { + auto my_league = team.league_rank(); // teamid + auto rowid = level_idx(my_league + lev_start);//teamid-->rowid + //auto my_team = team.team_rank(); + + //Kokkos::single(Kokkos::PerTeam(team),[&] () { + // printf("BEFORE CREATE HASH MAP\n"); + //}); + + //START shared hash map initialization + char *all_shared_memory = (char *)(team.team_shmem().get_shmem(shmem_size)); + + // Threads in a team share 4 arrays: begin, next, keys, values + // used_hash_sizes hold the size of 1st and 2nd level hashes (not using 2nd level hash right now) + volatile nnz_lno_t *used_hash_sizes = (volatile nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * 2; + + //points to begin array + nnz_lno_t *begins = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * shmem_hash_size; + + // points to the next elements + nnz_lno_t *nexts = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size; + + // holds the keys and vals + nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size; + nnz_lno_t *vals = (nnz_lno_t *)(all_shared_memory); + + hashmap_type hm(shmem_key_size, shared_memory_hash_func, begins, nexts, keys, vals); + + //Kokkos::single(Kokkos::PerTeam(team),[&] () { + // printf("BEFORE INIT\n"); + //}); + + // initialize begins + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, shmem_hash_size), [&](int i) { + begins[i] = -1; + }); + + // initialize hash usage sizes + Kokkos::single(Kokkos::PerTeam(team), [&]() { + used_hash_sizes[0] = 0; + used_hash_sizes[1] = 0; + }); + + team.team_barrier(); + //Shared hash map initialization DONE + + Kokkos::single(Kokkos::PerTeam(team),[&] () { + printf("TEST BEFORE INSERT HASH used_hash_sizes %d\n",used_hash_sizes[0]); + }); + + auto k1 = L_row_map(rowid); + auto k2 = L_row_map(rowid+1); +#ifdef KEEP_DIAG + Kokkos::parallel_for( Kokkos::TeamThreadRange( team, k1, k2-1 ), [&] ( const nnz_lno_t k ) { + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr(col, k, used_hash_sizes); + }); +#else + Kokkos::parallel_for( Kokkos::TeamThreadRange( team, k1, k2 ), [&] ( const nnz_lno_t k ) { + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr(col, k, used_hash_sizes); + }); +#endif + +#ifdef KEEP_DIAG + //if ( my_team == 0 ) L_values(k2-1) = scalar_t(1.0); + Kokkos::single(Kokkos::PerTeam(team),[&] () { L_values(k2-1) = scalar_t(1.0); }); +#endif + + team.team_barrier(); + + Kokkos::single(Kokkos::PerTeam(team),[&] () { + printf("TEST AFTER INSERT HASH used_hash_sizes %d\n",used_hash_sizes[0]); + }); + + k1 = U_row_map(rowid); + k2 = U_row_map(rowid+1); + Kokkos::parallel_for( Kokkos::TeamThreadRange( team, k1, k2 ), [&] ( const nnz_lno_t k ) { + nnz_lno_t col = static_cast(U_entries(k)); + U_values(k) = 0.0; + int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr(col, k, used_hash_sizes); + }); + + team.team_barrier(); + + //Unpack the ith row of A + k1 = A_row_map(rowid); + k2 = A_row_map(rowid+1); + Kokkos::parallel_for( Kokkos::TeamThreadRange( team, k1, k2 ), [&] ( const nnz_lno_t k ) { + nnz_lno_t col = static_cast(A_entries(k)); + nnz_lno_t hashmap_idx = hm.find(col); + if (hashmap_idx != -1) { + nnz_lno_t ipos = hm.values[hashmap_idx]; + if (col < rowid) + L_values(ipos) = A_values(k); + else + U_values(ipos) = A_values(k); + } + }); + + team.team_barrier(); + + //Eliminate prev rows + k1 = L_row_map(rowid); + k2 = L_row_map(rowid+1); +#ifdef KEEP_DIAG + for (auto k = k1; k < k2-1; ++k) +#else + for (auto k = k1; k < k2; ++k) +#endif + { + auto prev_row = L_entries(k); +#ifdef KEEP_DIAG + auto fact = L_values(k) / U_values(U_row_map(prev_row)); +#else + auto fact = L_values(k) * U_values(U_row_map(prev_row)); +#endif + //if ( my_team == 0 ) L_values(k) = fact; + Kokkos::single(Kokkos::PerTeam(team),[&] () { L_values(k) = fact; }); + + team.team_barrier(); + + Kokkos::parallel_for( Kokkos::TeamThreadRange( team, U_row_map(prev_row)+1, U_row_map(prev_row+1) ), [&] ( const size_type kk ) { + nnz_lno_t col = static_cast(U_entries(kk)); + nnz_lno_t hashmap_idx = hm.find(col); + if (hashmap_idx != -1) { + nnz_lno_t ipos = hm.values[hashmap_idx]; + auto lxu = -U_values(kk) * fact; + if (col < rowid) + //L_values(ipos) += lxu; + Kokkos::atomic_add (&L_values(ipos), lxu); + else + //U_values(ipos) += lxu; + Kokkos::atomic_add (&U_values(ipos), lxu); + } + });// end for kk + + team.team_barrier(); + }// end for k + + //if ( my_team == 0 ) { + Kokkos::single(Kokkos::PerTeam(team),[&] () { + nnz_lno_t hashmap_idx = hm.find(rowid); + if (hashmap_idx != -1) { + nnz_lno_t ipos = hm.values[hashmap_idx]; +#ifdef KEEP_DIAG + if (U_values(ipos) == 0.0) { + U_values(ipos) = 1e6; + } +#else + if (U_values(ipos) == 0.0) { + U_values(ipos) = 1e6; + } + else { + U_values(ipos) = 1.0 / U_values(ipos); + } +#endif + } + }); + //} + //Note: Reseting the hash table umap is done outside the kernel + } + + nnz_lno_t team_shmem_size(int /* team_size */) const { + return shmem_size; + } +}; + template (lev_start, lev_end), - ILUKLvlSchedRPNumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, - LEntriesType, LValuesType, URowMapType, UEntriesType, - UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t>( - A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, level_idx, iw, lev_start)); - } else if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) { + auto level_shmem_hash_size = thandle.get_level_shmem_hash_size(); + auto level_shmem_key_size = thandle.get_level_shmem_key_size(); + + for ( size_type lvl = 0; lvl < nlevels; ++lvl ) { + nnz_lno_t lev_start = level_ptr_h(lvl); + nnz_lno_t lev_end = level_ptr_h(lvl+1); + + if ( (lev_end - lev_start) != 0 ) { using policy_type = Kokkos::TeamPolicy; - int team_size = thandle.get_team_size(); - - nnz_lno_t lvl_rowid_start = 0; - nnz_lno_t lvl_nrows_chunk; - for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) { - if ((lvl_rowid_start + level_nrowsperchunk_h(lvl)) > - (lev_end - lev_start)) - lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start; - else - lvl_nrows_chunk = level_nrowsperchunk_h(lvl); - - ILUKLvlSchedTP1NumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, - LValuesType, URowMapType, UEntriesType, UValuesType, - HandleDeviceEntriesType, WorkViewType, nnz_lno_t> - tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, - L_values, U_row_map, U_entries, U_values, level_idx, iw, - lev_start + lvl_rowid_start); - - if (team_size == -1) - Kokkos::parallel_for("parfor_l_team", - policy_type(lvl_nrows_chunk, Kokkos::AUTO), - tstf); - else - Kokkos::parallel_for("parfor_l_team", - policy_type(lvl_nrows_chunk, team_size), tstf); - - lvl_rowid_start += lvl_nrows_chunk; + using scratch_space = typename execution_space::scratch_memory_space; + using view_type_1d_scratch = Kokkos::View; + + nnz_lno_t shmem_hash_size = static_cast(level_shmem_hash_size(lvl)); + nnz_lno_t shmem_key_size = static_cast(level_shmem_key_size(lvl)); + + nnz_lno_t shared_memory_hash_func = shmem_hash_size - 1;//for AND operation we use -1 + + //shmem needs the first 2 entries for sizes + //nnz_lno_t shmem_size = (2 + shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); + nnz_lno_t shmem_size = view_type_1d_scratch::shmem_size(2 + shmem_hash_size + shmem_key_size * 3); + + printf("lvl %d, shmem_hash_size %d, shmem_key_size %d, shmem_size %d\n",lvl, shmem_hash_size, shmem_key_size, shmem_size); + + int team_size = thandle.get_team_size(); + ILUKLvlSchedTP1HashMapNumericFunctor tstf(A_row_map, A_entries, A_values, + L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values, + level_idx, lev_start, + shmem_hash_size, shmem_key_size, + shared_memory_hash_func, shmem_size); + if ( team_size == -1 ) + Kokkos::parallel_for("parfor_l_team", policy_type( lev_end - lev_start , Kokkos::AUTO ), tstf); + else + Kokkos::parallel_for("parfor_l_team", policy_type( lev_end - lev_start , team_size ), tstf); + } // end if + } // end for lvl + }//End SEQLVLSCHD_TP1HASHMAP + else { + if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + level_nchunks_h = LevelHostViewType( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level nchunks"), + level_nchunks.extent(0)); + level_nrowsperchunk_h = + LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, + "Host level nrowsperchunk"), + level_nrowsperchunk.extent(0)); + Kokkos::deep_copy(level_nchunks_h, level_nchunks); + Kokkos::deep_copy(level_nrowsperchunk_h, level_nrowsperchunk); + iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), + thandle.get_level_maxrowsperchunk(), nrows); + Kokkos::deep_copy(iw, nnz_lno_t(-1)); + } else { + iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), + thandle.get_level_maxrows(), nrows); + Kokkos::deep_copy(iw, nnz_lno_t(-1)); + } + + // Main loop must be performed sequential. Question: Try out Cuda's graph + // stuff to reduce kernel launch overhead + for (size_type lvl = 0; lvl < nlevels; ++lvl) { + nnz_lno_t lev_start = level_ptr_h(lvl); + nnz_lno_t lev_end = level_ptr_h(lvl + 1); + + if ((lev_end - lev_start) != 0) { + if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( + "parfor_fixed_lvl", + Kokkos::RangePolicy(lev_start, lev_end), + ILUKLvlSchedRPNumericFunctor< + ARowMapType, AEntriesType, AValuesType, LRowMapType, + LEntriesType, LValuesType, URowMapType, UEntriesType, + UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t>( + A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values, level_idx, iw, lev_start)); + } else if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + using policy_type = Kokkos::TeamPolicy; + int team_size = thandle.get_team_size(); + + nnz_lno_t lvl_rowid_start = 0; + nnz_lno_t lvl_nrows_chunk; + for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) { + if ((lvl_rowid_start + level_nrowsperchunk_h(lvl)) > + (lev_end - lev_start)) + lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start; + else + lvl_nrows_chunk = level_nrowsperchunk_h(lvl); + + ILUKLvlSchedTP1NumericFunctor< + ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, + LValuesType, URowMapType, UEntriesType, UValuesType, + HandleDeviceEntriesType, WorkViewType, nnz_lno_t> + tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values, level_idx, iw, + lev_start + lvl_rowid_start); + + if (team_size == -1) + Kokkos::parallel_for("parfor_l_team", + policy_type(lvl_nrows_chunk, Kokkos::AUTO), + tstf); + else + Kokkos::parallel_for("parfor_l_team", + policy_type(lvl_nrows_chunk, team_size), tstf); + + lvl_rowid_start += lvl_nrows_chunk; + } } - } - // /* - // // TP2 algorithm has issues with some offset-ordinal combo to be - // addressed else if ( thandle.get_algorithm() == - // KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { - // typedef Kokkos::TeamPolicy tvt_policy_type; - // - // int team_size = thandle.get_team_size(); - // if ( team_size == -1 ) { - // team_size = std::is_same< typename - // Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace - // >::value ? 1 : 128; - // } - // int vector_size = thandle.get_team_size(); - // if ( vector_size == -1 ) { - // vector_size = std::is_same< typename - // Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace - // >::value ? 1 : 4; - // } - // - // // This impl: "chunk" lvl_nodes into node_groups; a league_rank - // is responsible for processing that many nodes - // // TeamThreadRange over number of node_groups - // // To avoid masking threads, 1 thread (team) per node in - // node_group - // // ThreadVectorRange responsible for the actual solve - // computation const int node_groups = team_size; - // - // LowerTriLvlSchedTP2SolverFunctor - // tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - // row_count, node_groups); - // Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( - // (int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, - // vector_size ), tstf); - // } // end elseif - // */ - - } // end if - } // end for lvl + } // end if + } // end for lvl + } // Output check #ifdef NUMERIC_OUTPUT_INFO diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 90bb88e057..7e1d063aa5 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -219,6 +219,154 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, level_nrowsperchunk = lnrowsperchunk; } +template +void level_sched ( IlukHandle& thandle, + const LRowMapType L_row_map, const LEntriesType L_entries, + const URowMapType U_row_map, const UEntriesType U_entries, + LevelType1& level_list, LevelType2& level_ptr, LevelType2& level_idx, size_type &nlevels ) { + // Scheduling currently compute on host + + using nnz_lno_t = typename IlukHandle::nnz_lno_t; + + size_type nrows = thandle.get_nrows(); + + nlevels = 0; + level_ptr(0) = 0; + + for ( size_type i = 0; i < nrows; ++i ) { + size_type l = 0; + size_type rowstart= L_row_map(i); + size_type rowend = L_row_map(i+1); + for ( size_type j = rowstart; j < rowend; ++j ) { + nnz_lno_t col = L_entries(j); + l = std::max(l, level_list(col)); + } + level_list(i) = l+1; + level_ptr(l+1) += 1; + nlevels = std::max(nlevels, l+1); + } + + for ( size_type i = 1; i <= nlevels; ++i ) { + level_ptr(i) += level_ptr(i-1); + } + + for ( size_type i = 0; i < nrows; i++ ) { + level_idx(level_ptr(level_list(i)-1)) = i; + level_ptr(level_list(i)-1) += 1; + } + + if (nlevels>0) {//note: to avoid wrapping around to the max of size_t when nlevels = 0. + for ( size_type i = nlevels-1; i > 0; --i ) { + level_ptr(i) = level_ptr(i-1); + } + } + + level_ptr(0) = 0; + + //Find the maximum number of nnz per row per level + //Determine shmem hash size and key size + //(max. number of non-zeros in both L and U) + size_type maxrows = 0; + + //TEST + size_type max_maxnnzperrow = 0; + size_type max_shmem_hash_size = 0; + size_type max_shmem_key_size = 0; + size_type min_maxnnzperrow = 2000000000; + size_type min_shmem_hash_size = 2000000000; + size_type min_shmem_key_size = 2000000000; + + thandle.alloc_level_maxnnzperrow(nlevels); + thandle.alloc_level_shmem_hash_size(nlevels); + thandle.alloc_level_shmem_key_size(nlevels); + + auto level_maxnnzperrow = thandle.get_level_maxnnzperrow(); + auto level_shmem_hash_size = thandle.get_level_shmem_hash_size(); + auto level_shmem_key_size = thandle.get_level_shmem_key_size(); + + for ( size_type i = 0; i < nlevels; i++ ) { + size_type lnrows = level_ptr(i+1) - level_ptr(i); + if( maxrows < lnrows ) { + maxrows = lnrows; + } + //Determine the number of non-zeros in each level + size_type rid_s = level_ptr(i); + size_type rid_e = level_ptr(i+1); + size_type lnnz = 0; + size_type lmaxnnz = 0; + for (size_type rid = rid_s; rid < rid_e; rid++) {//Look at each row in a level + size_type rnnz = (L_row_map(rid+1) - L_row_map(rid)) + + (U_row_map(rid+1) - U_row_map(rid));//count the number of non-zeros in the current row (both L and U) + lnnz += rnnz;//accumulate to count the nnz in the current level + if( lmaxnnz < rnnz ) { + lmaxnnz = rnnz; + } + } + level_maxnnzperrow(i) = lmaxnnz; + + size_type shmem_key_size = lmaxnnz;//the number of keys can a team (row) hold + + // put the hash size closest power of 2. + // we round down here, because we want to store more keys, + // conflicts are cheaper. + size_type shmem_hash_size = 1; + while (shmem_hash_size * 2 <= shmem_key_size) { + shmem_hash_size = shmem_hash_size * 2; + } + + // increase the key size with the left over from hash size. + shmem_key_size = shmem_key_size + (shmem_key_size - shmem_hash_size) / 3; //note: divided by 3 because nexts, keys, values have sizes of shmem_key_size + // round it down to 2, because of some alignment issues. + shmem_key_size = (shmem_key_size >> 1) << 1; + + level_shmem_hash_size(i) = shmem_hash_size; + level_shmem_key_size(i) = shmem_key_size; + + if ((i < 20)|| (i >= (nlevels-20))) { + std::cout << "Level " << i+1 << " has " << level_ptr(i+1) - level_ptr(i) << " rows"; + std::cout << ", maxnnzperrow: " << level_maxnnzperrow(i); + std::cout << ", shmem_hash_size: " << level_shmem_hash_size(i); + std::cout << ", shmem_key_size: " << level_shmem_key_size(i); + std::cout << std::endl; + } + + if( max_maxnnzperrow < level_maxnnzperrow(i) ) { + max_maxnnzperrow = level_maxnnzperrow(i); + } + if( min_maxnnzperrow > level_maxnnzperrow(i) ) { + min_maxnnzperrow = level_maxnnzperrow(i); + } + if( max_shmem_hash_size < level_shmem_hash_size(i) ) { + max_shmem_hash_size = level_shmem_hash_size(i); + } + if( min_shmem_hash_size > level_shmem_hash_size(i) ) { + min_shmem_hash_size = level_shmem_hash_size(i); + } + if( max_shmem_key_size < level_shmem_key_size(i) ) { + max_shmem_key_size = level_shmem_key_size(i); + } + if( min_shmem_key_size > level_shmem_key_size(i) ) { + min_shmem_key_size = level_shmem_key_size(i); + } + } + + std::cout << " VINH TEST: spiluk_symbolic() -- " << ", unordered map capacity among levels: " << umapcapacity + << ", maxnnzperrow (max " << max_maxnnzperrow << ", min "<< min_maxnnzperrow << ")" + << ", shmem_hash_size (max " << max_shmem_hash_size << ", min "<< min_shmem_hash_size << ")" + << ", shmem_key_size (max " << max_shmem_key_size << ", min "<< min_shmem_key_size << ")" << std::endl; + + thandle.set_num_levels(nlevels); + thandle.set_level_maxrows(maxrows); + +} + // Linear Search for the smallest row index template size_type search_col_index(nnz_lno_t j, size_type lenl, ViewType h_iL, @@ -261,7 +409,9 @@ void iluk_symbolic(IlukHandle& thandle, if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP || thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1 || + thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) /* || thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHED_TP2 )*/ { @@ -471,6 +621,10 @@ void iluk_symbolic(IlukHandle& thandle, // Level scheduling on L if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) { + level_sched (thandle, L_row_map, L_entries, U_row_map, U_entries, + level_list, level_ptr, level_idx, nlev); + } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, level_idx, level_nchunks, level_nrowsperchunk, nlev); From 14a6991dcf430949f5d65d2fd9fbefbfa751464c Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 10 May 2022 23:14:01 -0700 Subject: [PATCH 131/261] Update implementation --- src/sparse/KokkosSparse_spiluk_handle.hpp | 1 + .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 45 +++++++----- .../KokkosSparse_spiluk_symbolic_impl.hpp | 73 ++++++------------- 3 files changed, 50 insertions(+), 69 deletions(-) diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp index f7112c61dc..fc15b6f4a7 100644 --- a/src/sparse/KokkosSparse_spiluk_handle.hpp +++ b/src/sparse/KokkosSparse_spiluk_handle.hpp @@ -45,6 +45,7 @@ #include #include #include +#include #ifndef _SPILUKHANDLE_HPP #define _SPILUKHANDLE_HPP diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 758d0a2622..c0d08919ea 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -425,7 +425,7 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor //auto my_team = team.team_rank(); //Kokkos::single(Kokkos::PerTeam(team),[&] () { - // printf("BEFORE CREATE HASH MAP\n"); + // printf("BEFORE CREATE HASH MAP: team %d/%d, thread %d/%d\n", team.league_rank(), team.league_size(), team.team_rank(), team.team_size()); //}); //START shared hash map initialization @@ -452,7 +452,7 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor hashmap_type hm(shmem_key_size, shared_memory_hash_func, begins, nexts, keys, vals); //Kokkos::single(Kokkos::PerTeam(team),[&] () { - // printf("BEFORE INIT\n"); + // printf("BEFORE INIT HASH MAP: team %d/%d, thread %d/%d\n", team.league_rank(), team.league_size(), team.team_rank(), team.team_size()); //}); // initialize begins @@ -460,6 +460,11 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor begins[i] = -1; }); + + //Kokkos::single(Kokkos::PerTeam(team),[&] () { + // printf("AFTER INIT BEGINS: team %d/%d, thread %d/%d\n", team.league_rank(), team.league_size(), team.team_rank(), team.team_size()); + //}); + // initialize hash usage sizes Kokkos::single(Kokkos::PerTeam(team), [&]() { used_hash_sizes[0] = 0; @@ -469,9 +474,9 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor team.team_barrier(); //Shared hash map initialization DONE - Kokkos::single(Kokkos::PerTeam(team),[&] () { - printf("TEST BEFORE INSERT HASH used_hash_sizes %d\n",used_hash_sizes[0]); - }); + //Kokkos::single(Kokkos::PerTeam(team),[&] () { + // printf("TEST BEFORE INSERT HASH used_hash_sizes %d\n",used_hash_sizes[0]); + //}); auto k1 = L_row_map(rowid); auto k2 = L_row_map(rowid+1); @@ -496,9 +501,9 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor team.team_barrier(); - Kokkos::single(Kokkos::PerTeam(team),[&] () { - printf("TEST AFTER INSERT HASH used_hash_sizes %d\n",used_hash_sizes[0]); - }); + //Kokkos::single(Kokkos::PerTeam(team),[&] () { + // printf("TEST AFTER INSERT HASH used_hash_sizes %d\n",used_hash_sizes[0]); + //}); k1 = U_row_map(rowid); k2 = U_row_map(rowid+1); @@ -644,8 +649,8 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, if ( (lev_end - lev_start) != 0 ) { using policy_type = Kokkos::TeamPolicy; - using scratch_space = typename execution_space::scratch_memory_space; - using view_type_1d_scratch = Kokkos::View; + ////using scratch_space = typename execution_space::scratch_memory_space; + ////using view_type_1d_scratch = Kokkos::View; nnz_lno_t shmem_hash_size = static_cast(level_shmem_hash_size(lvl)); nnz_lno_t shmem_key_size = static_cast(level_shmem_key_size(lvl)); @@ -653,10 +658,10 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, nnz_lno_t shared_memory_hash_func = shmem_hash_size - 1;//for AND operation we use -1 //shmem needs the first 2 entries for sizes - //nnz_lno_t shmem_size = (2 + shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); - nnz_lno_t shmem_size = view_type_1d_scratch::shmem_size(2 + shmem_hash_size + shmem_key_size * 3); + nnz_lno_t shmem_size = (2 + shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); + ////nnz_lno_t shmem_size = view_type_1d_scratch::shmem_size(2 + shmem_hash_size + shmem_key_size * 3); - printf("lvl %d, shmem_hash_size %d, shmem_key_size %d, shmem_size %d\n",lvl, shmem_hash_size, shmem_key_size, shmem_size); + //printf("lvl %d, shmem_hash_size %d, shmem_key_size %d, shmem_size %d, shmem_size_ %d, scratch_space %s\n",lvl, shmem_hash_size, shmem_key_size, shmem_size, shmem_size_, typeid(scratch_space).name()); int team_size = thandle.get_team_size(); ILUKLvlSchedTP1HashMapNumericFunctor -void level_sched ( IlukHandle& thandle, - const LRowMapType L_row_map, const LEntriesType L_entries, - const URowMapType U_row_map, const UEntriesType U_entries, - LevelType1& level_list, LevelType2& level_ptr, LevelType2& level_idx, size_type &nlevels ) { +void level_sched_hashmap ( IlukHandle& thandle, + const LRowMapType L_row_map, const LEntriesType L_entries, + const URowMapType U_row_map, const UEntriesType U_entries, + LevelType1& level_list, LevelType2& level_ptr, LevelType2& level_idx, size_type &nlevels ) { // Scheduling currently compute on host using nnz_lno_t = typename IlukHandle::nnz_lno_t; @@ -252,45 +252,37 @@ void level_sched ( IlukHandle& thandle, level_ptr(l+1) += 1; nlevels = std::max(nlevels, l+1); } - + for ( size_type i = 1; i <= nlevels; ++i ) { level_ptr(i) += level_ptr(i-1); } - + for ( size_type i = 0; i < nrows; i++ ) { level_idx(level_ptr(level_list(i)-1)) = i; level_ptr(level_list(i)-1) += 1; } - + if (nlevels>0) {//note: to avoid wrapping around to the max of size_t when nlevels = 0. for ( size_type i = nlevels-1; i > 0; --i ) { level_ptr(i) = level_ptr(i-1); } } - + level_ptr(0) = 0; - + //Find the maximum number of nnz per row per level //Determine shmem hash size and key size //(max. number of non-zeros in both L and U) size_type maxrows = 0; - - //TEST - size_type max_maxnnzperrow = 0; - size_type max_shmem_hash_size = 0; - size_type max_shmem_key_size = 0; - size_type min_maxnnzperrow = 2000000000; - size_type min_shmem_hash_size = 2000000000; - size_type min_shmem_key_size = 2000000000; - + thandle.alloc_level_maxnnzperrow(nlevels); thandle.alloc_level_shmem_hash_size(nlevels); thandle.alloc_level_shmem_key_size(nlevels); - + auto level_maxnnzperrow = thandle.get_level_maxnnzperrow(); auto level_shmem_hash_size = thandle.get_level_shmem_hash_size(); auto level_shmem_key_size = thandle.get_level_shmem_key_size(); - + for ( size_type i = 0; i < nlevels; i++ ) { size_type lnrows = level_ptr(i+1) - level_ptr(i); if( maxrows < lnrows ) { @@ -310,9 +302,9 @@ void level_sched ( IlukHandle& thandle, } } level_maxnnzperrow(i) = lmaxnnz; - - size_type shmem_key_size = lmaxnnz;//the number of keys can a team (row) hold - + + size_type shmem_key_size = 3*lmaxnnz;//the number of keys can a team (row) hold + // put the hash size closest power of 2. // we round down here, because we want to store more keys, // conflicts are cheaper. @@ -320,12 +312,12 @@ void level_sched ( IlukHandle& thandle, while (shmem_hash_size * 2 <= shmem_key_size) { shmem_hash_size = shmem_hash_size * 2; } - + // increase the key size with the left over from hash size. shmem_key_size = shmem_key_size + (shmem_key_size - shmem_hash_size) / 3; //note: divided by 3 because nexts, keys, values have sizes of shmem_key_size // round it down to 2, because of some alignment issues. shmem_key_size = (shmem_key_size >> 1) << 1; - + level_shmem_hash_size(i) = shmem_hash_size; level_shmem_key_size(i) = shmem_key_size; @@ -334,37 +326,14 @@ void level_sched ( IlukHandle& thandle, std::cout << ", maxnnzperrow: " << level_maxnnzperrow(i); std::cout << ", shmem_hash_size: " << level_shmem_hash_size(i); std::cout << ", shmem_key_size: " << level_shmem_key_size(i); + std::cout << ", shared_memory_hash_func: " << level_shmem_hash_size(i)-1; + std::cout << ", shmem_size: " << (2 + shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); std::cout << std::endl; } - - if( max_maxnnzperrow < level_maxnnzperrow(i) ) { - max_maxnnzperrow = level_maxnnzperrow(i); - } - if( min_maxnnzperrow > level_maxnnzperrow(i) ) { - min_maxnnzperrow = level_maxnnzperrow(i); - } - if( max_shmem_hash_size < level_shmem_hash_size(i) ) { - max_shmem_hash_size = level_shmem_hash_size(i); - } - if( min_shmem_hash_size > level_shmem_hash_size(i) ) { - min_shmem_hash_size = level_shmem_hash_size(i); - } - if( max_shmem_key_size < level_shmem_key_size(i) ) { - max_shmem_key_size = level_shmem_key_size(i); - } - if( min_shmem_key_size > level_shmem_key_size(i) ) { - min_shmem_key_size = level_shmem_key_size(i); - } } - std::cout << " VINH TEST: spiluk_symbolic() -- " << ", unordered map capacity among levels: " << umapcapacity - << ", maxnnzperrow (max " << max_maxnnzperrow << ", min "<< min_maxnnzperrow << ")" - << ", shmem_hash_size (max " << max_shmem_hash_size << ", min "<< min_shmem_hash_size << ")" - << ", shmem_key_size (max " << max_shmem_key_size << ", min "<< min_shmem_key_size << ")" << std::endl; - thandle.set_num_levels(nlevels); thandle.set_level_maxrows(maxrows); - } // Linear Search for the smallest row index @@ -622,8 +591,8 @@ void iluk_symbolic(IlukHandle& thandle, // Level scheduling on L if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) { - level_sched (thandle, L_row_map, L_entries, U_row_map, U_entries, - level_list, level_ptr, level_idx, nlev); + level_sched_hashmap(thandle, L_row_map, L_entries, U_row_map, U_entries, + level_list, level_ptr, level_idx, nlev); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, From 33523f96384eadd14fc06696e4192d5684c76dcf Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 10 May 2022 23:21:00 -0700 Subject: [PATCH 132/261] Delete comments --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 24 ------------------- 1 file changed, 24 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index c0d08919ea..8148730f65 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -424,10 +424,6 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor auto rowid = level_idx(my_league + lev_start);//teamid-->rowid //auto my_team = team.team_rank(); - //Kokkos::single(Kokkos::PerTeam(team),[&] () { - // printf("BEFORE CREATE HASH MAP: team %d/%d, thread %d/%d\n", team.league_rank(), team.league_size(), team.team_rank(), team.team_size()); - //}); - //START shared hash map initialization char *all_shared_memory = (char *)(team.team_shmem().get_shmem(shmem_size)); @@ -451,19 +447,10 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor hashmap_type hm(shmem_key_size, shared_memory_hash_func, begins, nexts, keys, vals); - //Kokkos::single(Kokkos::PerTeam(team),[&] () { - // printf("BEFORE INIT HASH MAP: team %d/%d, thread %d/%d\n", team.league_rank(), team.league_size(), team.team_rank(), team.team_size()); - //}); - // initialize begins Kokkos::parallel_for(Kokkos::TeamThreadRange(team, shmem_hash_size), [&](int i) { begins[i] = -1; }); - - - //Kokkos::single(Kokkos::PerTeam(team),[&] () { - // printf("AFTER INIT BEGINS: team %d/%d, thread %d/%d\n", team.league_rank(), team.league_size(), team.team_rank(), team.team_size()); - //}); // initialize hash usage sizes Kokkos::single(Kokkos::PerTeam(team), [&]() { @@ -474,10 +461,6 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor team.team_barrier(); //Shared hash map initialization DONE - //Kokkos::single(Kokkos::PerTeam(team),[&] () { - // printf("TEST BEFORE INSERT HASH used_hash_sizes %d\n",used_hash_sizes[0]); - //}); - auto k1 = L_row_map(rowid); auto k2 = L_row_map(rowid+1); #ifdef KEEP_DIAG @@ -501,10 +484,6 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor team.team_barrier(); - //Kokkos::single(Kokkos::PerTeam(team),[&] () { - // printf("TEST AFTER INSERT HASH used_hash_sizes %d\n",used_hash_sizes[0]); - //}); - k1 = U_row_map(rowid); k2 = U_row_map(rowid+1); Kokkos::parallel_for( Kokkos::TeamThreadRange( team, k1, k2 ), [&] ( const nnz_lno_t k ) { @@ -590,7 +569,6 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor } }); //} - //Note: Reseting the hash table umap is done outside the kernel } nnz_lno_t team_shmem_size(int /* team_size */) const { @@ -661,8 +639,6 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, nnz_lno_t shmem_size = (2 + shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); ////nnz_lno_t shmem_size = view_type_1d_scratch::shmem_size(2 + shmem_hash_size + shmem_key_size * 3); - //printf("lvl %d, shmem_hash_size %d, shmem_key_size %d, shmem_size %d, shmem_size_ %d, scratch_space %s\n",lvl, shmem_hash_size, shmem_key_size, shmem_size, shmem_size_, typeid(scratch_space).name()); - int team_size = thandle.get_team_size(); ILUKLvlSchedTP1HashMapNumericFunctor Date: Wed, 11 May 2022 11:12:39 -0600 Subject: [PATCH 133/261] Fix colliding include guards (copy-paste mistake) (fix #1407) --- .../KokkosGraph_color_d1_eti_spec_avail.hpp.in | 4 ++-- .../KokkosGraph_color_d1_eti_spec_decl.hpp.in | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in index 7b9b69063c..daff73b371 100644 --- a/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in +++ b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in @@ -1,5 +1,5 @@ -#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_AVAIL_HPP_ -#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_AVAIL_HPP_ +#ifndef KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL_HPP_ /* //@HEADER // ************************************************************************ diff --git a/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in index fc47564161..8e8ca17113 100644 --- a/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in +++ b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in @@ -1,5 +1,5 @@ -#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#ifndef KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_ /* //@HEADER // ************************************************************************ From 27b45c1fb180b77f424e1fa9082e4167ebd6a7e6 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 9 May 2022 09:18:03 -0600 Subject: [PATCH 134/261] D1 coloring: remove unused but set variable --- src/graph/impl/KokkosGraph_Distance1Color_impl.hpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 87d3c193cd..64873708b5 100644 --- a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -411,7 +411,6 @@ class GraphColor_VB nnz_lno_t numUncolored = this->nv; - double t, total = 0.0; double total_time_greedy_phase = 0.0; double total_time_find_conflicts = 0.0; double total_time_serial_conflict_resolution = 0.0; @@ -435,8 +434,7 @@ class GraphColor_VB MyExecSpace().fence(); if (this->_ticToc) { - t = timer.seconds(); - total += t; + double t = timer.seconds(); total_time_greedy_phase += t; std::cout << "\tTime speculative greedy phase " << iter << " : " << t << std::endl; @@ -459,8 +457,7 @@ class GraphColor_VB MyExecSpace().fence(); if (_ticToc) { - t = timer.seconds(); - total += t; + double t = timer.seconds(); total_time_find_conflicts += t; std::cout << "\tTime conflict detection " << iter << " : " << t << std::endl; @@ -500,8 +497,7 @@ class GraphColor_VB } MyExecSpace().fence(); if (_ticToc) { - t = timer.seconds(); - total += t; + double t = timer.seconds(); total_time_serial_conflict_resolution += t; std::cout << "\tTime serial conflict resolution: " << t << std::endl; } From 9d19a7427610c7f6c943cede3cfbdef14fa4891d Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 11 May 2022 22:12:43 -0700 Subject: [PATCH 135/261] Fix nnz calculation --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 18 +++++----- .../KokkosSparse_spiluk_symbolic_impl.hpp | 35 ++++++++++--------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 8148730f65..98e1a38539 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -492,6 +492,11 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr(col, k, used_hash_sizes); }); + //Kokkos::single(Kokkos::PerTeam(team),[&] () { + // if (temp_nnz_cnt > shmem_key_size) + // printf("VINHVINH teamid %d, rowid %d (at level %d), temp_nnz_cnt %d, shmem_key_size %d\n", my_league, rowid, lvl+1, temp_nnz_cnt, shmem_key_size); + //}); + team.team_barrier(); //Unpack the ith row of A @@ -571,9 +576,9 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor //} } - nnz_lno_t team_shmem_size(int /* team_size */) const { - return shmem_size; - } + //nnz_lno_t team_shmem_size(int /* team_size */) const { + // return shmem_size; + //} }; template ; - ////using scratch_space = typename execution_space::scratch_memory_space; - ////using view_type_1d_scratch = Kokkos::View; nnz_lno_t shmem_hash_size = static_cast(level_shmem_hash_size(lvl)); nnz_lno_t shmem_key_size = static_cast(level_shmem_key_size(lvl)); @@ -637,7 +640,6 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, //shmem needs the first 2 entries for sizes nnz_lno_t shmem_size = (2 + shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); - ////nnz_lno_t shmem_size = view_type_1d_scratch::shmem_size(2 + shmem_hash_size + shmem_key_size * 3); int team_size = thandle.get_team_size(); ILUKLvlSchedTP1HashMapNumericFunctor> 1) << 1; - + level_shmem_hash_size(i) = shmem_hash_size; level_shmem_key_size(i) = shmem_key_size; - if ((i < 20)|| (i >= (nlevels-20))) { - std::cout << "Level " << i+1 << " has " << level_ptr(i+1) - level_ptr(i) << " rows"; - std::cout << ", maxnnzperrow: " << level_maxnnzperrow(i); - std::cout << ", shmem_hash_size: " << level_shmem_hash_size(i); - std::cout << ", shmem_key_size: " << level_shmem_key_size(i); - std::cout << ", shared_memory_hash_func: " << level_shmem_hash_size(i)-1; - std::cout << ", shmem_size: " << (2 + shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); - std::cout << std::endl; - } + //if ((i < 20)|| (i >= (nlevels-20))) { + // std::cout << "Level " << i+1 << " has " << level_ptr(i+1) - level_ptr(i) << " rows"; + // std::cout << ", maxnnzperrow: " << level_maxnnzperrow(i); + // std::cout << ", shmem_hash_size: " << level_shmem_hash_size(i); + // std::cout << ", shmem_key_size: " << level_shmem_key_size(i); + // std::cout << ", shared_memory_hash_func: " << level_shmem_hash_size(i)-1; + // std::cout << ", shmem_size: " << (2 + shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); + // std::cout << std::endl; + //} } thandle.set_num_levels(nlevels); From f911f45e4994003df847c8ca3dcfd05aef0a7472 Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Wed, 11 May 2022 23:27:10 -0600 Subject: [PATCH 136/261] Apply clang format --- src/sparse/KokkosSparse_spiluk_handle.hpp | 36 +- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 359 +++++++++--------- .../KokkosSparse_spiluk_symbolic_impl.hpp | 131 ++++--- 3 files changed, 280 insertions(+), 246 deletions(-) diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp index fc15b6f4a7..1bf520c02b 100644 --- a/src/sparse/KokkosSparse_spiluk_handle.hpp +++ b/src/sparse/KokkosSparse_spiluk_handle.hpp @@ -108,9 +108,12 @@ class SPILUKHandle { nnz_lno_view_t level_nchunks; // number of chunks of rows at each level nnz_lno_view_t level_nrowsperchunk; // maximum number of rows among chunks at each level - nnz_row_view_host_t level_maxnnzperrow; //maximum number of nnz per row at each level - nnz_row_view_host_t level_shmem_hash_size;//hash size in the shared memory hash map at each level - nnz_row_view_host_t level_shmem_key_size; //key size in the shared memory hash map at each level + nnz_row_view_host_t + level_maxnnzperrow; // maximum number of nnz per row at each level + nnz_row_view_host_t level_shmem_hash_size; // hash size in the shared memory + // hash map at each level + nnz_row_view_host_t level_shmem_key_size; // key size in the shared memory + // hash map at each level size_type nrows; size_type nlevels; @@ -162,10 +165,9 @@ class SPILUKHandle { level_idx = nnz_lno_view_t("level_idx", nrows_), level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), level_nchunks = nnz_lno_view_t(), level_nrowsperchunk = nnz_lno_view_t(), - level_maxnnzperrow = nnz_row_view_host_t(), + level_maxnnzperrow = nnz_row_view_host_t(), level_shmem_hash_size = nnz_row_view_host_t(), - level_shmem_key_size = nnz_row_view_host_t(), - reset_symbolic_complete(); + level_shmem_key_size = nnz_row_view_host_t(), reset_symbolic_complete(); } virtual ~SPILUKHandle(){}; @@ -198,24 +200,32 @@ class SPILUKHandle { } KOKKOS_INLINE_FUNCTION - nnz_row_view_host_t get_level_maxnnzperrow() const { return level_maxnnzperrow; } + nnz_row_view_host_t get_level_maxnnzperrow() const { + return level_maxnnzperrow; + } void alloc_level_maxnnzperrow(const size_type nlevels_) { level_maxnnzperrow = nnz_row_view_host_t("level_maxnnzperrow", nlevels_); } KOKKOS_INLINE_FUNCTION - nnz_row_view_host_t get_level_shmem_hash_size() const { return level_shmem_hash_size; } + nnz_row_view_host_t get_level_shmem_hash_size() const { + return level_shmem_hash_size; + } void alloc_level_shmem_hash_size(const size_type nlevels_) { - level_shmem_hash_size = nnz_row_view_host_t("level_shmem_hash_size", nlevels_); + level_shmem_hash_size = + nnz_row_view_host_t("level_shmem_hash_size", nlevels_); } KOKKOS_INLINE_FUNCTION - nnz_row_view_host_t get_level_shmem_key_size() const { return level_shmem_key_size; } + nnz_row_view_host_t get_level_shmem_key_size() const { + return level_shmem_key_size; + } void alloc_level_shmem_key_size(const size_type nlevels_) { - level_shmem_key_size = nnz_row_view_host_t("level_shmem_key_size", nlevels_); + level_shmem_key_size = + nnz_row_view_host_t("level_shmem_key_size", nlevels_); } KOKKOS_INLINE_FUNCTION @@ -273,7 +283,7 @@ class SPILUKHandle { if (algm == SPILUKAlgorithm::SEQLVLSCHD_TP1) std::cout << "SEQLVLSCHD_TP1" << std::endl; - if ( algm == SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP ) + if (algm == SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) std::cout << "SEQLVLSCHD_TP1HASHMAP" << std::endl; /* if ( algm == SPILUKAlgorithm::SEQLVLSCHED_TP2 ) { @@ -291,7 +301,7 @@ class SPILUKHandle { return SPILUKAlgorithm::SEQLVLSCHD_RP; else if (name == "SPILUK_TEAMPOLICY1") return SPILUKAlgorithm::SEQLVLSCHD_TP1; - else if (name=="SPILUK_TEAMPOLICY1HASHMAP") + else if (name == "SPILUK_TEAMPOLICY1HASHMAP") return SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP; /*else if(name=="SPILUK_TEAMPOLICY2") return * SPILUKAlgorithm::SEQLVLSCHED_TP2;*/ diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 98e1a38539..2c3c8dd1c2 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -369,70 +369,78 @@ struct ILUKLvlSchedTP1NumericFunctor { } }; -template -struct ILUKLvlSchedTP1HashMapNumericFunctor -{ +template +struct ILUKLvlSchedTP1HashMapNumericFunctor { using execution_space = typename ARowMapType::execution_space; using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; using size_type = typename ARowMapType::non_const_value_type; - using scalar_t = typename AValuesType::non_const_value_type ; - using hashmap_type = KokkosKernels::Experimental::HashmapAccumulator - ; - - ARowMapType A_row_map; - AEntriesType A_entries; - AValuesType A_values; - LRowMapType L_row_map; - LEntriesType L_entries; - LValuesType L_values; - URowMapType U_row_map; - UEntriesType U_entries; - UValuesType U_values; + using scalar_t = typename AValuesType::non_const_value_type; + using hashmap_type = KokkosKernels::Experimental::HashmapAccumulator< + nnz_lno_t, nnz_lno_t, nnz_lno_t, + KokkosKernels::Experimental::HashOpType::bitwiseAnd>; + + ARowMapType A_row_map; + AEntriesType A_entries; + AValuesType A_values; + LRowMapType L_row_map; + LEntriesType L_entries; + LValuesType L_values; + URowMapType U_row_map; + UEntriesType U_entries; + UValuesType U_values; LevelViewType level_idx; - nnz_lno_t lev_start; - nnz_lno_t shmem_hash_size; - nnz_lno_t shmem_key_size; - nnz_lno_t shared_memory_hash_func; - nnz_lno_t shmem_size; - - ILUKLvlSchedTP1HashMapNumericFunctor( const ARowMapType &A_row_map_, const AEntriesType &A_entries_, const AValuesType &A_values_, - const LRowMapType &L_row_map_, const LEntriesType &L_entries_, LValuesType &L_values_, - const URowMapType &U_row_map_, const UEntriesType &U_entries_, UValuesType &U_values_, - const LevelViewType &level_idx_, const nnz_lno_t &lev_start_, const nnz_lno_t &shmem_hash_size_, - const nnz_lno_t &shmem_key_size_, const nnz_lno_t &shared_memory_hash_func_, const nnz_lno_t &shmem_size_) : - A_row_map(A_row_map_), A_entries(A_entries_), A_values(A_values_), - L_row_map(L_row_map_), L_entries(L_entries_), L_values(L_values_), - U_row_map(U_row_map_), U_entries(U_entries_), U_values(U_values_), - level_idx(level_idx_), lev_start(lev_start_), shmem_hash_size(shmem_hash_size_), - shmem_key_size(shmem_key_size_), shared_memory_hash_func(shared_memory_hash_func_), - shmem_size(shmem_size_) {} + nnz_lno_t lev_start; + nnz_lno_t shmem_hash_size; + nnz_lno_t shmem_key_size; + nnz_lno_t shared_memory_hash_func; + nnz_lno_t shmem_size; + + ILUKLvlSchedTP1HashMapNumericFunctor( + const ARowMapType &A_row_map_, const AEntriesType &A_entries_, + const AValuesType &A_values_, const LRowMapType &L_row_map_, + const LEntriesType &L_entries_, LValuesType &L_values_, + const URowMapType &U_row_map_, const UEntriesType &U_entries_, + UValuesType &U_values_, const LevelViewType &level_idx_, + const nnz_lno_t &lev_start_, const nnz_lno_t &shmem_hash_size_, + const nnz_lno_t &shmem_key_size_, + const nnz_lno_t &shared_memory_hash_func_, const nnz_lno_t &shmem_size_) + : A_row_map(A_row_map_), + A_entries(A_entries_), + A_values(A_values_), + L_row_map(L_row_map_), + L_entries(L_entries_), + L_values(L_values_), + U_row_map(U_row_map_), + U_entries(U_entries_), + U_values(U_values_), + level_idx(level_idx_), + lev_start(lev_start_), + shmem_hash_size(shmem_hash_size_), + shmem_key_size(shmem_key_size_), + shared_memory_hash_func(shared_memory_hash_func_), + shmem_size(shmem_size_) {} KOKKOS_INLINE_FUNCTION - void operator()( const member_type & team ) const { - auto my_league = team.league_rank(); // teamid - auto rowid = level_idx(my_league + lev_start);//teamid-->rowid - //auto my_team = team.team_rank(); + void operator()(const member_type &team) const { + auto my_league = team.league_rank(); // teamid + auto rowid = level_idx(my_league + lev_start); // teamid-->rowid + // auto my_team = team.team_rank(); - //START shared hash map initialization + // START shared hash map initialization char *all_shared_memory = (char *)(team.team_shmem().get_shmem(shmem_size)); // Threads in a team share 4 arrays: begin, next, keys, values - // used_hash_sizes hold the size of 1st and 2nd level hashes (not using 2nd level hash right now) - volatile nnz_lno_t *used_hash_sizes = (volatile nnz_lno_t *)(all_shared_memory); + // used_hash_sizes hold the size of 1st and 2nd level hashes (not using 2nd + // level hash right now) + volatile nnz_lno_t *used_hash_sizes = + (volatile nnz_lno_t *)(all_shared_memory); all_shared_memory += sizeof(nnz_lno_t) * 2; - //points to begin array + // points to begin array nnz_lno_t *begins = (nnz_lno_t *)(all_shared_memory); all_shared_memory += sizeof(nnz_lno_t) * shmem_hash_size; @@ -445,13 +453,13 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size; nnz_lno_t *vals = (nnz_lno_t *)(all_shared_memory); - hashmap_type hm(shmem_key_size, shared_memory_hash_func, begins, nexts, keys, vals); + hashmap_type hm(shmem_key_size, shared_memory_hash_func, begins, nexts, + keys, vals); // initialize begins - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, shmem_hash_size), [&](int i) { - begins[i] = -1; - }); - + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, shmem_hash_size), + [&](int i) { begins[i] = -1; }); + // initialize hash usage sizes Kokkos::single(Kokkos::PerTeam(team), [&]() { used_hash_sizes[0] = 0; @@ -459,68 +467,78 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor }); team.team_barrier(); - //Shared hash map initialization DONE + // Shared hash map initialization DONE - auto k1 = L_row_map(rowid); - auto k2 = L_row_map(rowid+1); + auto k1 = L_row_map(rowid); + auto k2 = L_row_map(rowid + 1); #ifdef KEEP_DIAG - Kokkos::parallel_for( Kokkos::TeamThreadRange( team, k1, k2-1 ), [&] ( const nnz_lno_t k ) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr(col, k, used_hash_sizes); - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, k1, k2 - 1), [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr( + col, k, used_hash_sizes); + }); #else - Kokkos::parallel_for( Kokkos::TeamThreadRange( team, k1, k2 ), [&] ( const nnz_lno_t k ) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr(col, k, used_hash_sizes); - }); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr( + col, k, used_hash_sizes); + }); #endif #ifdef KEEP_DIAG - //if ( my_team == 0 ) L_values(k2-1) = scalar_t(1.0); - Kokkos::single(Kokkos::PerTeam(team),[&] () { L_values(k2-1) = scalar_t(1.0); }); + // if ( my_team == 0 ) L_values(k2-1) = scalar_t(1.0); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { L_values(k2 - 1) = scalar_t(1.0); }); #endif team.team_barrier(); - k1 = U_row_map(rowid); - k2 = U_row_map(rowid+1); - Kokkos::parallel_for( Kokkos::TeamThreadRange( team, k1, k2 ), [&] ( const nnz_lno_t k ) { - nnz_lno_t col = static_cast(U_entries(k)); - U_values(k) = 0.0; - int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr(col, k, used_hash_sizes); - }); - - //Kokkos::single(Kokkos::PerTeam(team),[&] () { + k1 = U_row_map(rowid); + k2 = U_row_map(rowid + 1); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(U_entries(k)); + U_values(k) = 0.0; + int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr( + col, k, used_hash_sizes); + }); + + // Kokkos::single(Kokkos::PerTeam(team),[&] () { // if (temp_nnz_cnt > shmem_key_size) - // printf("VINHVINH teamid %d, rowid %d (at level %d), temp_nnz_cnt %d, shmem_key_size %d\n", my_league, rowid, lvl+1, temp_nnz_cnt, shmem_key_size); + // printf("VINHVINH teamid %d, rowid %d (at level %d), temp_nnz_cnt %d, + // shmem_key_size %d\n", my_league, rowid, lvl+1, temp_nnz_cnt, + // shmem_key_size); //}); team.team_barrier(); - - //Unpack the ith row of A + + // Unpack the ith row of A k1 = A_row_map(rowid); - k2 = A_row_map(rowid+1); - Kokkos::parallel_for( Kokkos::TeamThreadRange( team, k1, k2 ), [&] ( const nnz_lno_t k ) { - nnz_lno_t col = static_cast(A_entries(k)); - nnz_lno_t hashmap_idx = hm.find(col); - if (hashmap_idx != -1) { - nnz_lno_t ipos = hm.values[hashmap_idx]; - if (col < rowid) - L_values(ipos) = A_values(k); - else - U_values(ipos) = A_values(k); - } - }); + k2 = A_row_map(rowid + 1); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(A_entries(k)); + nnz_lno_t hashmap_idx = hm.find(col); + if (hashmap_idx != -1) { + nnz_lno_t ipos = hm.values[hashmap_idx]; + if (col < rowid) + L_values(ipos) = A_values(k); + else + U_values(ipos) = A_values(k); + } + }); team.team_barrier(); - - //Eliminate prev rows - k1 = L_row_map(rowid); - k2 = L_row_map(rowid+1); + + // Eliminate prev rows + k1 = L_row_map(rowid); + k2 = L_row_map(rowid + 1); #ifdef KEEP_DIAG - for (auto k = k1; k < k2-1; ++k) + for (auto k = k1; k < k2 - 1; ++k) #else for (auto k = k1; k < k2; ++k) #endif @@ -531,31 +549,34 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor #else auto fact = L_values(k) * U_values(U_row_map(prev_row)); #endif - //if ( my_team == 0 ) L_values(k) = fact; - Kokkos::single(Kokkos::PerTeam(team),[&] () { L_values(k) = fact; }); + // if ( my_team == 0 ) L_values(k) = fact; + Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; }); team.team_barrier(); - Kokkos::parallel_for( Kokkos::TeamThreadRange( team, U_row_map(prev_row)+1, U_row_map(prev_row+1) ), [&] ( const size_type kk ) { - nnz_lno_t col = static_cast(U_entries(kk)); - nnz_lno_t hashmap_idx = hm.find(col); - if (hashmap_idx != -1) { - nnz_lno_t ipos = hm.values[hashmap_idx]; - auto lxu = -U_values(kk) * fact; - if (col < rowid) - //L_values(ipos) += lxu; - Kokkos::atomic_add (&L_values(ipos), lxu); - else - //U_values(ipos) += lxu; - Kokkos::atomic_add (&U_values(ipos), lxu); - } - });// end for kk + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, + U_row_map(prev_row + 1)), + [&](const size_type kk) { + nnz_lno_t col = static_cast(U_entries(kk)); + nnz_lno_t hashmap_idx = hm.find(col); + if (hashmap_idx != -1) { + nnz_lno_t ipos = hm.values[hashmap_idx]; + auto lxu = -U_values(kk) * fact; + if (col < rowid) + // L_values(ipos) += lxu; + Kokkos::atomic_add(&L_values(ipos), lxu); + else + // U_values(ipos) += lxu; + Kokkos::atomic_add(&U_values(ipos), lxu); + } + }); // end for kk team.team_barrier(); - }// end for k + } // end for k - //if ( my_team == 0 ) { - Kokkos::single(Kokkos::PerTeam(team),[&] () { + // if ( my_team == 0 ) { + Kokkos::single(Kokkos::PerTeam(team), [&]() { nnz_lno_t hashmap_idx = hm.find(rowid); if (hashmap_idx != -1) { nnz_lno_t ipos = hm.values[hashmap_idx]; @@ -576,7 +597,7 @@ struct ILUKLvlSchedTP1HashMapNumericFunctor //} } - //nnz_lno_t team_shmem_size(int /* team_size */) const { + // nnz_lno_t team_shmem_size(int /* team_size */) const { // return shmem_size; //} }; @@ -622,55 +643,50 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, Kokkos::deep_copy(level_ptr_h, level_ptr); if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) { + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) { auto level_shmem_hash_size = thandle.get_level_shmem_hash_size(); auto level_shmem_key_size = thandle.get_level_shmem_key_size(); - - for ( size_type lvl = 0; lvl < nlevels; ++lvl ) { + + for (size_type lvl = 0; lvl < nlevels; ++lvl) { nnz_lno_t lev_start = level_ptr_h(lvl); - nnz_lno_t lev_end = level_ptr_h(lvl+1); - - if ( (lev_end - lev_start) != 0 ) { + nnz_lno_t lev_end = level_ptr_h(lvl + 1); + + if ((lev_end - lev_start) != 0) { using policy_type = Kokkos::TeamPolicy; - nnz_lno_t shmem_hash_size = static_cast(level_shmem_hash_size(lvl)); - nnz_lno_t shmem_key_size = static_cast(level_shmem_key_size(lvl)); - - nnz_lno_t shared_memory_hash_func = shmem_hash_size - 1;//for AND operation we use -1 + nnz_lno_t shmem_hash_size = + static_cast(level_shmem_hash_size(lvl)); + nnz_lno_t shmem_key_size = + static_cast(level_shmem_key_size(lvl)); + + nnz_lno_t shared_memory_hash_func = + shmem_hash_size - 1; // for AND operation we use -1 - //shmem needs the first 2 entries for sizes - nnz_lno_t shmem_size = (2 + shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); + // shmem needs the first 2 entries for sizes + nnz_lno_t shmem_size = + (2 + shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); int team_size = thandle.get_team_size(); - ILUKLvlSchedTP1HashMapNumericFunctor tstf(A_row_map, A_entries, A_values, - L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, - level_idx, lev_start, - shmem_hash_size, shmem_key_size, - shared_memory_hash_func, shmem_size); - if ( team_size == -1 ) { - policy_type team_policy(lev_end - lev_start , Kokkos::AUTO); + ILUKLvlSchedTP1HashMapNumericFunctor< + ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, + LValuesType, URowMapType, UEntriesType, UValuesType, + HandleDeviceEntriesType, nnz_lno_t> + tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values, level_idx, lev_start, + shmem_hash_size, shmem_key_size, shared_memory_hash_func, + shmem_size); + if (team_size == -1) { + policy_type team_policy(lev_end - lev_start, Kokkos::AUTO); team_policy.set_scratch_size(0, Kokkos::PerTeam(shmem_size)); Kokkos::parallel_for("parfor_l_team", team_policy, tstf); - } - else { - policy_type team_policy(lev_end - lev_start , team_size); + } else { + policy_type team_policy(lev_end - lev_start, team_size); team_policy.set_scratch_size(0, Kokkos::PerTeam(shmem_size)); Kokkos::parallel_for("parfor_l_team", team_policy, tstf); } - } // end if - } // end for lvl - }//End SEQLVLSCHD_TP1HASHMAP + } // end if + } // end for lvl + } // End SEQLVLSCHD_TP1HASHMAP else { if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { @@ -691,13 +707,13 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, thandle.get_level_maxrows(), nrows); Kokkos::deep_copy(iw, nnz_lno_t(-1)); } - + // Main loop must be performed sequential. Question: Try out Cuda's graph // stuff to reduce kernel launch overhead for (size_type lvl = 0; lvl < nlevels; ++lvl) { nnz_lno_t lev_start = level_ptr_h(lvl); nnz_lno_t lev_end = level_ptr_h(lvl + 1); - + if ((lev_end - lev_start) != 0) { if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { @@ -707,14 +723,16 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, ILUKLvlSchedRPNumericFunctor< ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, LValuesType, URowMapType, UEntriesType, - UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t>( - A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, level_idx, iw, lev_start)); + UValuesType, HandleDeviceEntriesType, WorkViewType, + nnz_lno_t>(A_row_map, A_entries, A_values, L_row_map, + L_entries, L_values, U_row_map, U_entries, + U_values, level_idx, iw, lev_start)); } else if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + KokkosSparse::Experimental::SPILUKAlgorithm:: + SEQLVLSCHD_TP1) { using policy_type = Kokkos::TeamPolicy; int team_size = thandle.get_team_size(); - + nnz_lno_t lvl_rowid_start = 0; nnz_lno_t lvl_nrows_chunk; for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) { @@ -723,23 +741,24 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start; else lvl_nrows_chunk = level_nrowsperchunk_h(lvl); - + ILUKLvlSchedTP1NumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, - LValuesType, URowMapType, UEntriesType, UValuesType, - HandleDeviceEntriesType, WorkViewType, nnz_lno_t> + ARowMapType, AEntriesType, AValuesType, LRowMapType, + LEntriesType, LValuesType, URowMapType, UEntriesType, + UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t> tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, level_idx, iw, lev_start + lvl_rowid_start); - + if (team_size == -1) Kokkos::parallel_for("parfor_l_team", policy_type(lvl_nrows_chunk, Kokkos::AUTO), tstf); else Kokkos::parallel_for("parfor_l_team", - policy_type(lvl_nrows_chunk, team_size), tstf); - + policy_type(lvl_nrows_chunk, team_size), + tstf); + lvl_rowid_start += lvl_nrows_chunk; } } diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index c40d8cb68c..18e0e54eef 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -219,18 +219,15 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, level_nrowsperchunk = lnrowsperchunk; } -template -void level_sched_hashmap ( IlukHandle& thandle, - const LRowMapType L_row_map, const LEntriesType L_entries, - const URowMapType U_row_map, const UEntriesType U_entries, - LevelType1& level_list, LevelType2& level_ptr, LevelType2& level_idx, size_type &nlevels ) { +template +void level_sched_hashmap(IlukHandle& thandle, const LRowMapType L_row_map, + const LEntriesType L_entries, + const URowMapType U_row_map, + const UEntriesType U_entries, LevelType1& level_list, + LevelType2& level_ptr, LevelType2& level_idx, + size_type& nlevels) { // Scheduling currently compute on host using nnz_lno_t = typename IlukHandle::nnz_lno_t; @@ -240,71 +237,75 @@ void level_sched_hashmap ( IlukHandle& thandle, nlevels = 0; level_ptr(0) = 0; - for ( size_type i = 0; i < nrows; ++i ) { - size_type l = 0; - size_type rowstart= L_row_map(i); - size_type rowend = L_row_map(i+1); - for ( size_type j = rowstart; j < rowend; ++j ) { + for (size_type i = 0; i < nrows; ++i) { + size_type l = 0; + size_type rowstart = L_row_map(i); + size_type rowend = L_row_map(i + 1); + for (size_type j = rowstart; j < rowend; ++j) { nnz_lno_t col = L_entries(j); - l = std::max(l, level_list(col)); + l = std::max(l, level_list(col)); } - level_list(i) = l+1; - level_ptr(l+1) += 1; - nlevels = std::max(nlevels, l+1); + level_list(i) = l + 1; + level_ptr(l + 1) += 1; + nlevels = std::max(nlevels, l + 1); } - - for ( size_type i = 1; i <= nlevels; ++i ) { - level_ptr(i) += level_ptr(i-1); + + for (size_type i = 1; i <= nlevels; ++i) { + level_ptr(i) += level_ptr(i - 1); } - - for ( size_type i = 0; i < nrows; i++ ) { - level_idx(level_ptr(level_list(i)-1)) = i; - level_ptr(level_list(i)-1) += 1; + + for (size_type i = 0; i < nrows; i++) { + level_idx(level_ptr(level_list(i) - 1)) = i; + level_ptr(level_list(i) - 1) += 1; } - - if (nlevels>0) {//note: to avoid wrapping around to the max of size_t when nlevels = 0. - for ( size_type i = nlevels-1; i > 0; --i ) { - level_ptr(i) = level_ptr(i-1); + + if (nlevels > 0) { // note: to avoid wrapping around to the max of size_t + // when nlevels = 0. + for (size_type i = nlevels - 1; i > 0; --i) { + level_ptr(i) = level_ptr(i - 1); } } - + level_ptr(0) = 0; - - //Find the maximum number of nnz per row per level - //Determine shmem hash size and key size + + // Find the maximum number of nnz per row per level + // Determine shmem hash size and key size //(max. number of non-zeros in both L and U) size_type maxrows = 0; - + thandle.alloc_level_maxnnzperrow(nlevels); thandle.alloc_level_shmem_hash_size(nlevels); thandle.alloc_level_shmem_key_size(nlevels); - + auto level_maxnnzperrow = thandle.get_level_maxnnzperrow(); auto level_shmem_hash_size = thandle.get_level_shmem_hash_size(); auto level_shmem_key_size = thandle.get_level_shmem_key_size(); - - for ( size_type i = 0; i < nlevels; i++ ) { - size_type lnrows = level_ptr(i+1) - level_ptr(i); - if( maxrows < lnrows ) { + + for (size_type i = 0; i < nlevels; i++) { + size_type lnrows = level_ptr(i + 1) - level_ptr(i); + if (maxrows < lnrows) { maxrows = lnrows; } - //Determine the number of non-zeros in each level - size_type r_s = level_ptr(i); - size_type r_e = level_ptr(i+1); - size_type lnnz = 0; + // Determine the number of non-zeros in each level + size_type r_s = level_ptr(i); + size_type r_e = level_ptr(i + 1); + size_type lnnz = 0; size_type lmaxnnz = 0; - for (size_type r = r_s; r < r_e; r++) {//Look at each row in a level - auto rid = level_idx(r); //get actual rowid in the level - size_type rnnz = (L_row_map(rid+1) - L_row_map(rid)) + - (U_row_map(rid+1) - U_row_map(rid));//count the number of non-zeros in the current row (both L and U) - lnnz += rnnz;//accumulate to count the nnz in the current level - if( lmaxnnz < rnnz ) { + for (size_type r = r_s; r < r_e; r++) { // Look at each row in a level + auto rid = level_idx(r); // get actual rowid in the level + size_type rnnz = (L_row_map(rid + 1) - L_row_map(rid)) + + (U_row_map(rid + 1) - + U_row_map(rid)); // count the number of non-zeros in + // the current row (both L and U) + lnnz += rnnz; // accumulate to count the nnz in the current level + if (lmaxnnz < rnnz) { lmaxnnz = rnnz; } } level_maxnnzperrow(i) = lmaxnnz; - - size_type shmem_key_size = lmaxnnz;//the number of keys can a team (row) hold + + size_type shmem_key_size = + lmaxnnz; // the number of keys can a team (row) hold // put the hash size closest power of 2. // we round down here, because we want to store more keys, @@ -315,21 +316,25 @@ void level_sched_hashmap ( IlukHandle& thandle, } // increase the key size with the left over from hash size. - shmem_key_size = shmem_key_size + (shmem_key_size - shmem_hash_size) / 3; //note: divided by 3 because nexts, keys, values have sizes of shmem_key_size + shmem_key_size = + shmem_key_size + (shmem_key_size - shmem_hash_size) / + 3; // note: divided by 3 because nexts, keys, + // values have sizes of shmem_key_size // round it down to 2 and multiply by 2, because of some alignment issues. shmem_key_size = (shmem_key_size >> 1) << 1; level_shmem_hash_size(i) = shmem_hash_size; level_shmem_key_size(i) = shmem_key_size; - - //if ((i < 20)|| (i >= (nlevels-20))) { - // std::cout << "Level " << i+1 << " has " << level_ptr(i+1) - level_ptr(i) << " rows"; - // std::cout << ", maxnnzperrow: " << level_maxnnzperrow(i); + + // if ((i < 20)|| (i >= (nlevels-20))) { + // std::cout << "Level " << i+1 << " has " << level_ptr(i+1) - level_ptr(i) + // << " rows"; std::cout << ", maxnnzperrow: " << level_maxnnzperrow(i); // std::cout << ", shmem_hash_size: " << level_shmem_hash_size(i); // std::cout << ", shmem_key_size: " << level_shmem_key_size(i); - // std::cout << ", shared_memory_hash_func: " << level_shmem_hash_size(i)-1; - // std::cout << ", shmem_size: " << (2 + shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); - // std::cout << std::endl; + // std::cout << ", shared_memory_hash_func: " << + // level_shmem_hash_size(i)-1; std::cout << ", shmem_size: " << (2 + + // shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); std::cout << + // std::endl; //} } @@ -595,7 +600,7 @@ void iluk_symbolic(IlukHandle& thandle, level_sched_hashmap(thandle, L_row_map, L_entries, U_row_map, U_entries, level_list, level_ptr, level_idx, nlev); } else if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, level_idx, level_nchunks, level_nrowsperchunk, nlev); From cb1afe1b393abcdd88ce7997e637c478b0645d5f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 10 May 2022 17:06:10 -0600 Subject: [PATCH 137/261] src/sparse: Fix & check for fence post errors --- .github/workflows/osx.yml | 1 + src/sparse/KokkosSparse_csc2csr.hpp | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index e4e5a33719..e1f391ee9e 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -46,6 +46,7 @@ jobs: -DCMAKE_CXX_FLAGS="-Werror" \ -DCMAKE_CXX_STANDARD=14 \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEBUG_BOUNDS_CHECK:BOOL=ON \ -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ diff --git a/src/sparse/KokkosSparse_csc2csr.hpp b/src/sparse/KokkosSparse_csc2csr.hpp index 5b85671587..49f84f15da 100644 --- a/src/sparse/KokkosSparse_csc2csr.hpp +++ b/src/sparse/KokkosSparse_csc2csr.hpp @@ -109,8 +109,7 @@ class Csc2Csr { // Use exclusive scan so we can allocate the row map uninitialized and // avoid accessing device views on the host. KE::exclusive_scan(crsET, KE::cbegin(__crs_row_cnt), - KE::cend(__crs_row_cnt) + 1, KE::begin(__crs_row_map), - 0); + KE::cend(__crs_row_cnt), KE::begin(__crs_row_map), 0); CrsET().fence(); Kokkos::deep_copy(__crs_row_map_scratch, __crs_row_map); CrsET().fence(); @@ -203,7 +202,7 @@ class Csc2Csr { __crs_col_ids = CrsColIdViewType( Kokkos::view_alloc(Kokkos::WithoutInitializing, "__crs_col_ids"), nnz); - __crs_row_cnt = RowIdViewType("__crs_row_cnt", __nrows); + __crs_row_cnt = RowIdViewType("__crs_row_cnt", __nrows + 1); __Functor functor( __nrows, __ncols, __nnz, __vals, __crs_vals, __row_ids, __crs_row_map, From e137231313257d8f01d23b4308aa4ee33fb8cd6f Mon Sep 17 00:00:00 2001 From: kliegeois Date: Thu, 12 May 2022 13:29:56 -0600 Subject: [PATCH 138/261] Address #1409 format --- src/batched/dense/KokkosBatched_Gesv.hpp | 35 ++-- src/batched/dense/KokkosBatched_LU_Decl.hpp | 3 + .../dense/impl/KokkosBatched_Gesv_Impl.hpp | 160 +++++++++++------- 3 files changed, 120 insertions(+), 78 deletions(-) diff --git a/src/batched/dense/KokkosBatched_Gesv.hpp b/src/batched/dense/KokkosBatched_Gesv.hpp index 08ad9644a0..cda2225c43 100644 --- a/src/batched/dense/KokkosBatched_Gesv.hpp +++ b/src/batched/dense/KokkosBatched_Gesv.hpp @@ -62,16 +62,15 @@ struct Gesv { /// using a batched LU decomposition, 2 batched triangular solves, and a batched /// static pivoting. /// -/// \tparam MatrixType: Input type for the matrix, needs to be a 3D view +/// \tparam MatrixType: Input type for the matrix, needs to be a 2D view /// \tparam VectorType: Input type for the right-hand side and the solution, -/// needs to be a 2D view +/// needs to be a 1D view /// -/// \param A [in]: batched matrix, a rank 3 view -/// \param X [out]: solution, a rank 2 view -/// \param B [in]: right-hand side, a rank 2 view -/// \param tmp [in]: a rank 3 view used to store temporary variable; dimension -/// must be N x n x (n+4) where N is the batched size and n is the number of -/// rows. +/// \param A [in]: matrix, a rank 2 view +/// \param X [out]: solution, a rank 1 view +/// \param B [in]: right-hand side, a rank 1 view +/// \param tmp [in]: a rank 2 view used to store temporary variable; dimension +/// must be n x (n+4) where n is the number of rows. /// /// /// Two versions are available (those are chosen based on ArgAlgo): @@ -103,14 +102,14 @@ struct SerialGesv { /// using a batched LU decomposition, 2 batched triangular solves, and a batched /// static pivoting. /// -/// \tparam MatrixType: Input type for the matrix, needs to be a 3D view +/// \tparam MatrixType: Input type for the matrix, needs to be a 2D view /// \tparam VectorType: Input type for the right-hand side and the solution, -/// needs to be a 2D view +/// needs to be a 1D view /// /// \param member [in]: TeamPolicy member -/// \param A [in]: batched matrix, a rank 3 view -/// \param X [out]: solution, a rank 2 view -/// \param B [in]: right-hand side, a rank 2 view +/// \param A [in]: matrix, a rank 2 view +/// \param X [out]: solution, a rank 1 view +/// \param B [in]: right-hand side, a rank 1 view /// /// Two versions are available (those are chosen based on ArgAlgo): /// @@ -141,14 +140,14 @@ struct TeamGesv { /// using a batched LU decomposition, 2 batched triangular solves, and a batched /// static pivoting. /// -/// \tparam MatrixType: Input type for the matrix, needs to be a 3D view +/// \tparam MatrixType: Input type for the matrix, needs to be a 2D view /// \tparam VectorType: Input type for the right-hand side and the solution, -/// needs to be a 2D view +/// needs to be a 1D view /// /// \param member [in]: TeamPolicy member -/// \param A [in]: batched matrix, a rank 3 view -/// \param X [out]: solution, a rank 2 view -/// \param B [in]: right-hand side, a rank 2 view +/// \param A [in]: matrix, a rank 2 view +/// \param X [out]: solution, a rank 1 view +/// \param B [in]: right-hand side, a rank 1 view /// /// Two versions are available (those are chosen based on ArgAlgo): /// diff --git a/src/batched/dense/KokkosBatched_LU_Decl.hpp b/src/batched/dense/KokkosBatched_LU_Decl.hpp index 8cffbdc766..9fa2e2b6e3 100644 --- a/src/batched/dense/KokkosBatched_LU_Decl.hpp +++ b/src/batched/dense/KokkosBatched_LU_Decl.hpp @@ -51,4 +51,7 @@ struct LU { } // namespace KokkosBatched +#include "KokkosBatched_LU_Serial_Impl.hpp" +#include "KokkosBatched_LU_Team_Impl.hpp" + #endif diff --git a/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index 5a07a58990..616df45df9 100644 --- a/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -446,16 +446,20 @@ struct SerialGesv { return 1; } - SerialLU::invoke(PDAD); + int r_val = SerialLU::invoke(PDAD); - SerialTrsm::invoke(1.0, PDAD, PDY); + if (r_val == 0) + r_val = + SerialTrsm::invoke(1.0, PDAD, PDY); - SerialTrsm::invoke(1.0, PDAD, PDY); + if (r_val == 0) + r_val = + SerialTrsm::invoke(1.0, PDAD, PDY); - SerialHadamard1D(PDY, D2, X); - return 0; + if (r_val == 0) SerialHadamard1D(PDY, D2, X); + return r_val; } }; @@ -489,16 +493,21 @@ struct SerialGesv { } #endif - SerialLU::invoke(A); + int r_val = SerialLU::invoke(A); - SerialCopy::invoke(Y, X); - SerialTrsm::invoke(1.0, A, X); + if (r_val == 0) r_val = SerialCopy::invoke(Y, X); - SerialTrsm::invoke(1.0, A, X); + if (r_val == 0) + r_val = + SerialTrsm::invoke(1.0, A, X); - return 0; + if (r_val == 0) + r_val = + SerialTrsm::invoke(1.0, A, X); + + return r_val; } }; @@ -557,22 +566,31 @@ struct TeamGesv { } member.team_barrier(); - TeamLU::invoke(member, PDAD); + int r_val = + TeamLU::invoke(member, PDAD); member.team_barrier(); - TeamTrsm::invoke(member, 1.0, PDAD, - PDY); - member.team_barrier(); + if (r_val == 0) { + r_val = TeamTrsm::invoke(member, 1.0, + PDAD, PDY); + member.team_barrier(); + } - TeamTrsm::invoke(member, 1.0, PDAD, - PDY); - member.team_barrier(); + if (r_val == 0) { + r_val = + TeamTrsm::invoke(member, 1.0, + PDAD, PDY); + member.team_barrier(); + } - TeamHadamard1D(member, PDY, D2, X); - member.team_barrier(); - return 0; + if (r_val == 0) { + TeamHadamard1D(member, PDY, D2, X); + member.team_barrier(); + } + + return r_val; } }; @@ -605,21 +623,28 @@ struct TeamGesv { } #endif - TeamLU::invoke(member, A); + int r_val = TeamLU::invoke(member, A); member.team_barrier(); - TeamCopy::invoke(member, Y, X); - member.team_barrier(); + if (r_val == 0) { + TeamCopy::invoke(member, Y, X); + member.team_barrier(); + } - TeamTrsm::invoke(member, 1.0, A, X); - member.team_barrier(); + if (r_val == 0) { + TeamTrsm::invoke(member, 1.0, A, X); + member.team_barrier(); + } - TeamTrsm::invoke(member, 1.0, A, X); - member.team_barrier(); + if (r_val == 0) { + TeamTrsm::invoke(member, 1.0, A, + X); + member.team_barrier(); + } - return 0; + return r_val; } }; @@ -679,22 +704,31 @@ struct TeamVectorGesv { member.team_barrier(); - TeamLU::invoke(member, PDAD); + int r_val = + TeamLU::invoke(member, PDAD); member.team_barrier(); - TeamVectorTrsm::invoke(member, 1.0, - PDAD, PDY); - member.team_barrier(); + if (r_val == 0) { + TeamVectorTrsm::invoke(member, 1.0, + PDAD, PDY); + member.team_barrier(); + } - TeamVectorTrsm::invoke(member, 1.0, - PDAD, PDY); - member.team_barrier(); + if (r_val == 0) { + TeamVectorTrsm::invoke(member, + 1.0, PDAD, + PDY); + member.team_barrier(); + } - TeamVectorHadamard1D(member, PDY, D2, X); - member.team_barrier(); - return 0; + if (r_val == 0) { + TeamVectorHadamard1D(member, PDY, D2, X); + member.team_barrier(); + } + + return r_val; } }; @@ -727,23 +761,29 @@ struct TeamVectorGesv { } #endif - TeamLU::invoke(member, A); + int r_val = TeamLU::invoke(member, A); member.team_barrier(); - TeamVectorCopy::invoke(member, Y, X); - member.team_barrier(); + if (r_val == 0) { + TeamVectorCopy::invoke(member, Y, X); + member.team_barrier(); + } - TeamVectorTrsm::invoke(member, 1.0, A, - X); - member.team_barrier(); + if (r_val == 0) { + TeamVectorTrsm::invoke(member, 1.0, + A, X); + member.team_barrier(); + } - TeamVectorTrsm::invoke(member, 1.0, - A, X); - member.team_barrier(); + if (r_val == 0) { + TeamVectorTrsm::invoke(member, + 1.0, A, X); + member.team_barrier(); + } - return 0; + return r_val; } }; From efccd275ad70f9dd1db97e3ecf4b8dbb6562b934 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 13 May 2022 08:06:58 -0600 Subject: [PATCH 139/261] .github/workflows: - Disable bounds check for serial debug - Increase ctest timeout from 2500s to 1hr --- .github/workflows/osx.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index e1f391ee9e..78bdc2d681 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -16,12 +16,16 @@ jobs: include: - backend: "SERIAL" cmake_build_type: "RelWithDebInfo" + debug_bounds_check: "ON" - backend: "THREADS" cmake_build_type: "RelWithDebInfo" + debug_bounds_check: "ON" - backend: "SERIAL" cmake_build_type: "Debug" + debug_bounds_check: "OFF" - backend: "SERIAL" cmake_build_type: "Release" + debug_bounds_check: "ON" steps: - name: checkout_kokkos_kernels @@ -46,7 +50,7 @@ jobs: -DCMAKE_CXX_FLAGS="-Werror" \ -DCMAKE_CXX_STANDARD=14 \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ - -DKokkos_ENABLE_DEBUG_BOUNDS_CHECK:BOOL=ON \ + -DKokkos_ENABLE_DEBUG_BOUNDS_CHECK:BOOL=${{ matrix.debug_bounds_check }} \ -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ @@ -86,4 +90,4 @@ jobs: - name: test working-directory: kokkos-kernels/build - run: ctest -j2 --output-on-failure --timeout 2500 \ No newline at end of file + run: ctest -j2 --output-on-failure --timeout 3600 \ No newline at end of file From c98065ccaef5e9fe57dbe464daa52b3582baa88c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 13 May 2022 09:15:36 -0600 Subject: [PATCH 140/261] .github/workflows: Skip OSX when 'AT: WIP' exists --- .github/workflows/osx.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 78bdc2d681..7851db9dfb 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -2,12 +2,19 @@ name: github-OSX on: pull_request: - branches: - - master - - develop + types: [ opened, labeled, unlabeled, reopened, synchronize ] jobs: + check-pr-labels: + runs-on: [ubuntu-latest] + steps: + - uses: docker://agilepathway/pull-request-label-checker:latest + with: + none_of: 'AT: WIP' + repo_token: ${{ secrets.GITHUB_TOKEN }} osxci: + needs: check-pr-labels + # TODO: allow re-run via retest label if: ${{ github.event.label.name == 'AT: RETEST' }} name: osx-ci runs-on: [macos-latest] From e775acf2703bd3f98356adc24159153c59580fd9 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 16 May 2022 13:42:35 -0600 Subject: [PATCH 141/261] SpMV: fixing issues with unit-tests tolerance Implementing a new formula to determine the level of accuracy to be expected when checking the correctness of the SpMV algorithm. Fudging a bit epsilon to allow tests to pass... The tensor core examples required a slightly different formula to take into account blockSize in the calculation of the largest possible error that can occur during SpMV. Also needed to add a 2x fudge factor, not very satisfying but definitely acceptable! --- unit_test/sparse/Test_Sparse_spmv.hpp | 524 ++++++++++++++++---------- 1 file changed, 322 insertions(+), 202 deletions(-) diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index 3cbe3d401d..6cc48c863b 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -22,6 +22,32 @@ typedef Kokkos::Experimental::half_t kokkos_half; namespace Test { +// Functor checking that the results of SPMV +// are consistent with a reference sequential +// implementation of the same operation. +// +// Inputs: +// - _ex_y the expected result calculated +// from the reference implementation +// - _y the result from optimized SPMV being +// tested for correctness +// - _eps the tolerance required to accept the +// results as correct +// - _max_val the largest possible value that can +// be stored as an intermediate result +// during the computation +// +// The criteria to assess correctness is +// abs(_ex_y - _y) / _max_val < tol +// +// Note: _max_val in the case of SPMV can be computed +// as follows. Find the max number of entries per +// row in the matrix (max_row_length), also find the +// largest value that can be stored in the matrix, x +// and y vectors (max_mat, max_x and max_y). +// +// _max_val = beta*max_y +// + alpha*max_row_length*max_mat*max_x template struct fSPMV { using value_type = int; @@ -32,21 +58,23 @@ struct fSPMV { VectorType0 expected_y; VectorType1 y; mag_type eps; + mag_type max_val; - fSPMV(const VectorType0 &_ex_y, const VectorType1 &_y, const mag_type _eps) - : expected_y(_ex_y), y(_y), eps(_eps) {} + fSPMV(const VectorType0 &_ex_y, const VectorType1 &_y, const mag_type _eps, + const mag_type _max_val = ATM::one()) + : expected_y(_ex_y), + y(_y), + eps(AT::abs(_eps)), + max_val(AT::abs(_max_val)) {} KOKKOS_INLINE_FUNCTION void operator()(const int i, value_type &err) const { - const mag_type error = - AT::abs(expected_y(i) - y(i)) / (AT::abs(expected_y(i)) > ATM::zero() - ? AT::abs(expected_y(i)) - : ATM::one()); + const mag_type error = AT::abs(expected_y(i) - y(i)); - if (error > eps) { + if (error > eps * max_val) { err++; - // printf("expected_y(%d)=%f, y(%d)=%f err=%f, eps=%f\n", i, - // AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps); + printf("expected_y(%d)=%f, y(%d)=%f err=%f, max_error=%f\n", i, + AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps * max_val); } } }; @@ -113,9 +141,12 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, } template -void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, - typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, char mode) { +void check_spmv( + crsMat_t input_mat, x_vector_type x, y_vector_type y, + typename y_vector_type::non_const_value_type alpha, + typename y_vector_type::non_const_value_type beta, char mode, + typename Kokkos::ArithTraits::mag_type + max_val) { // typedef typename crsMat_t::StaticCrsGraphType graph_t; using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; @@ -123,11 +154,8 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, using y_value_trait = Kokkos::ArithTraits; using y_value_mag_type = typename y_value_trait::mag_type; - // y is the quantity being tested here, - // so let us use y_value_type to determine - // the appropriate tolerance precision. const y_value_mag_type eps = - std::is_same::value ? 2 * 1e-3 : 1e-7; + 10 * Kokkos::ArithTraits::eps(); bool transposed = (mode == 'T') || (mode == 'H'); y_vector_type expected_y( "expected", transposed ? input_mat.numCols() : input_mat.numRows()); @@ -150,7 +178,8 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, int num_errors = 0; Kokkos::parallel_reduce( "KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)), - fSPMV(expected_y, y, eps), num_errors); + fSPMV(expected_y, y, eps, max_val), + num_errors); if (num_errors > 0) printf("KokkosSparse::Test::spmv: %i errors of %i with params: %lf %lf\n", num_errors, y.extent_int(0), y_value_trait::abs(alpha), @@ -159,11 +188,13 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, } template -void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, - y_vector_type expected_y, - typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, int numMV, - char mode) { +void check_spmv_mv( + crsMat_t input_mat, x_vector_type x, y_vector_type y, + y_vector_type expected_y, + typename y_vector_type::non_const_value_type alpha, + typename y_vector_type::non_const_value_type beta, int numMV, char mode, + typename Kokkos::ArithTraits::mag_type + max_val) { using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -174,7 +205,7 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, // so let us use y_value_type to determine // the appropriate tolerance precision. const y_value_mag_type eps = - std::is_same::value ? 2 * 1e-3 : 1e-7; + 10 * Kokkos::ArithTraits::eps(); Kokkos::deep_copy(expected_y, y); @@ -205,7 +236,8 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, int num_errors = 0; Kokkos::parallel_reduce( "KokkosSparse::Test::spmv_mv", my_exec_space(0, y_i.extent(0)), - fSPMV(y_i, y_spmv, eps), num_errors); + fSPMV(y_i, y_spmv, eps, max_val), + num_errors); if (num_errors > 0) std::cout << "KokkosSparse::Test::spmv_mv: " << num_errors << " errors of " << y_i.extent_int(0) << " for mv " << i @@ -223,7 +255,9 @@ void check_spmv_struct( structure, x_vector_type x, y_vector_type y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta) { + typename y_vector_type::non_const_value_type beta, + typename Kokkos::ArithTraits::mag_type + max_val) { using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -233,9 +267,8 @@ void check_spmv_struct( // y is the quantity being tested here, // so let us use y_value_type to determine // the appropriate tolerance precision. - const double eps = - std::is_same::value ? 2 * 1e-3 : 1e-7; - const size_t nr = input_mat.numRows(); + const double eps = Kokkos::ArithTraits::eps(); + const size_t nr = input_mat.numRows(); y_vector_type expected_y("expected", nr); Kokkos::deep_copy(expected_y, y); Kokkos::fence(); @@ -247,13 +280,15 @@ void check_spmv_struct( int num_errors = 0; Kokkos::parallel_reduce( "KokkosKernels::UnitTests::spmv_struct", my_exec_space(0, y.extent(0)), - fSPMV(expected_y, y, eps), num_errors); - if (num_errors > 0) + fSPMV(expected_y, y, eps, max_val), + num_errors); + if (num_errors > 0) { printf( "KokkosKernels::UnitTests::spmv_struct: %i errors of %i with params: " "%d %lf %lf\n", num_errors, y.extent_int(0), stencil_type, y_value_trait::abs(alpha), y_value_trait::abs(beta)); + } EXPECT_TRUE(num_errors == 0); } // check_spmv_struct @@ -265,7 +300,9 @@ void check_spmv_mv_struct( structure, x_vector_type x, y_vector_type y, y_vector_type expected_y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, int numMV) { + typename y_vector_type::non_const_value_type beta, int numMV, + typename Kokkos::ArithTraits::mag_type + max_val) { using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -275,8 +312,7 @@ void check_spmv_mv_struct( // y is the quantity being tested here, // so let us use y_value_type to determine // the appropriate tolerance precision. - const double eps = - std::is_same::value ? 2 * 1e-3 : 1e-7; + const double eps = Kokkos::ArithTraits::eps(); Kokkos::deep_copy(expected_y, y); Kokkos::fence(); @@ -295,7 +331,8 @@ void check_spmv_mv_struct( Kokkos::parallel_reduce( "KokkosKernels::UnitTests::spmv_mv_struct", my_exec_space(0, y.extent(0)), - fSPMV(y_i, y_spmv, eps), num_errors); + fSPMV(y_i, y_spmv, eps, max_val), + num_errors); if (num_errors > 0) printf( "KokkosKernels::UnitTests::spmv_mv_struct: %i errors of %i with " @@ -307,10 +344,13 @@ void check_spmv_mv_struct( } // check_spmv_mv_struct template -void check_spmv_controls(KokkosKernels::Experimental::Controls controls, - crsMat_t input_mat, x_vector_type x, y_vector_type y, - typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta) { +void check_spmv_controls( + KokkosKernels::Experimental::Controls controls, crsMat_t input_mat, + x_vector_type x, y_vector_type y, + typename y_vector_type::non_const_value_type alpha, + typename y_vector_type::non_const_value_type beta, + typename Kokkos::ArithTraits::mag_type + max_val) { // typedef typename crsMat_t::StaticCrsGraphType graph_t; using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; @@ -339,7 +379,8 @@ void check_spmv_controls(KokkosKernels::Experimental::Controls controls, int num_errors = 0; Kokkos::parallel_reduce( "KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)), - fSPMV(expected_y, y, eps), num_errors); + fSPMV(expected_y, y, eps, max_val), + num_errors); if (num_errors > 0) printf("KokkosSparse::Test::spmv: %i errors of %i with params: %lf %lf\n", num_errors, y.extent_int(0), y_value_trait::abs(alpha), @@ -367,12 +408,16 @@ Kokkos::complex randomUpperBound>(int mag) { template void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, bool heavy) { - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t x_vector_type; - typedef scalar_view_t y_vector_type; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using scalar_view_t = typename crsMat_t::values_type::non_const_type; + using x_vector_type = scalar_view_t; + using y_vector_type = scalar_view_t; + using mag_t = typename Kokkos::ArithTraits::mag_type; + + constexpr mag_t max_x = static_cast(1); + constexpr mag_t max_y = static_cast(1); + constexpr mag_t max_val = static_cast(1); lno_t numCols = numRows; @@ -381,6 +426,9 @@ void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t nr = input_mat.numRows(); lno_t nc = input_mat.numCols(); + const lno_t max_nnz_per_row = + numRows ? (nnz / numRows + row_size_variance) : 0; + x_vector_type input_x("x", nc); y_vector_type output_y("y", nr); x_vector_type input_xt("x", nr); @@ -389,13 +437,16 @@ void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::Random_XorShift64_Pool rand_pool( 13718); - typedef typename x_vector_type::value_type ScalarX; - typedef typename y_vector_type::value_type ScalarY; + Kokkos::fill_random(input_x, rand_pool, randomUpperBound(max_x)); + Kokkos::fill_random(output_y, rand_pool, randomUpperBound(max_y)); + Kokkos::fill_random(input_xt, rand_pool, randomUpperBound(max_x)); + Kokkos::fill_random(output_yt, rand_pool, randomUpperBound(max_y)); - Kokkos::fill_random(input_x, rand_pool, randomUpperBound(1)); - Kokkos::fill_random(output_y, rand_pool, randomUpperBound(1)); - Kokkos::fill_random(input_xt, rand_pool, randomUpperBound(1)); - Kokkos::fill_random(output_yt, rand_pool, randomUpperBound(1)); + // We also need to bound the values + // in the matrix to bound the cancellations + // coming from arithmetic operations. + Kokkos::fill_random(input_mat.values, rand_pool, + randomUpperBound(max_val)); std::vector nonTransModes = {'N'}; std::vector transModes = {'T'}; @@ -409,14 +460,21 @@ void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, for (auto mode : nonTransModes) { for (double alpha : testAlphaBeta) { for (double beta : testAlphaBeta) { - Test::check_spmv(input_mat, input_x, output_y, alpha, beta, mode); + mag_t max_error = + beta * max_y + alpha * max_nnz_per_row * max_val * max_x; + Test::check_spmv(input_mat, input_x, output_y, alpha, beta, mode, + max_error); } } } for (auto mode : transModes) { for (double alpha : testAlphaBeta) { for (double beta : testAlphaBeta) { - Test::check_spmv(input_mat, input_xt, output_yt, alpha, beta, mode); + // hoping the transpose won't have a long column... + mag_t max_error = + beta * max_y + alpha * max_nnz_per_row * max_val * max_x; + Test::check_spmv(input_mat, input_xt, output_yt, alpha, beta, mode, + max_error); } } } @@ -426,14 +484,18 @@ template void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, bool heavy, int numMV) { - lno_t numCols = numRows; + using mag_t = typename Kokkos::ArithTraits::mag_type; - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; + constexpr mag_t max_x = static_cast(1); + constexpr mag_t max_y = static_cast(1); + constexpr mag_t max_val = static_cast(1); - typedef Kokkos::View ViewTypeX; - typedef Kokkos::View ViewTypeY; + lno_t numCols = numRows; + + using crsMat_t = typename KokkosSparse::CrsMatrix; + using ViewTypeX = Kokkos::View; + using ViewTypeY = Kokkos::View; ViewTypeX b_x("A", numRows, numMV); ViewTypeY b_y("B", numCols, numMV); @@ -445,14 +507,23 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, randomUpperBound(1)); - Kokkos::fill_random(b_y, rand_pool, randomUpperBound(1)); - Kokkos::fill_random(b_xt, rand_pool, randomUpperBound(1)); - Kokkos::fill_random(b_yt, rand_pool, randomUpperBound(1)); + Kokkos::fill_random(b_x, rand_pool, randomUpperBound(max_x)); + Kokkos::fill_random(b_y, rand_pool, randomUpperBound(max_y)); + Kokkos::fill_random(b_xt, rand_pool, randomUpperBound(max_x)); + Kokkos::fill_random(b_yt, rand_pool, randomUpperBound(max_y)); crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, row_size_variance, bandwidth); + const lno_t max_nnz_per_row = + numRows ? (nnz / numRows + row_size_variance) : 0; + + // We also need to bound the values + // in the matrix to bound the cancellations + // coming from arithmetic operations. + Kokkos::fill_random(input_mat.values, rand_pool, + randomUpperBound(max_val)); + Kokkos::deep_copy(b_y_copy, b_y); Kokkos::deep_copy(b_yt_copy, b_yt); @@ -468,16 +539,21 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, for (auto mode : nonTransModes) { for (double alpha : testAlphaBeta) { for (double beta : testAlphaBeta) { + mag_t max_error = + beta * max_y + alpha * max_nnz_per_row * max_val * max_x; Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, numMV, - mode); + mode, max_error); } } } for (auto mode : transModes) { for (double alpha : testAlphaBeta) { for (double beta : testAlphaBeta) { + // hoping the transpose won't have a long column... + mag_t max_error = + beta * max_y + alpha * max_nnz_per_row * max_val * max_x; Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, alpha, beta, - numMV, mode); + numMV, mode, max_error); } } } @@ -487,18 +563,24 @@ template void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, int numMV) { - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using ViewTypeX = Kokkos::View; + using ViewTypeY = Kokkos::View; + using mag_t = typename Kokkos::ArithTraits::mag_type; - typedef Kokkos::View ViewTypeX; - typedef Kokkos::View ViewTypeY; + constexpr mag_t max_x = static_cast(10); + constexpr mag_t max_y = static_cast(10); + constexpr mag_t max_val = static_cast(10); crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( numRows, numRows, nnz, row_size_variance, bandwidth); Kokkos::Random_XorShift64_Pool rand_pool( 13718); + const lno_t max_nnz_per_row = + numRows ? (nnz / numRows + row_size_variance) : 0; + for (int nv = 1; nv <= numMV; nv++) { ViewTypeX b_x("A", numRows, nv); ViewTypeY b_y("B", numRows, nv); @@ -506,22 +588,30 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::fill_random(b_x, rand_pool, scalar_t(10)); Kokkos::fill_random(b_y, rand_pool, scalar_t(10)); + Kokkos::fill_random(input_mat.values, rand_pool, scalar_t(10)); Kokkos::deep_copy(b_y_copy, b_y); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N'); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N'); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N'); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T'); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N', + max_nnz_per_row * max_val * max_x); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N', + max_y); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N', + max_y + max_nnz_per_row * max_val * max_x); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T', + max_nnz_per_row * max_val * max_x); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T', + max_y); // Testing all modes together, since matrix is square std::vector modes = {'N', 'C', 'T', 'H'}; std::vector testAlphaBeta = {0.0, 1.0, -1.0, 2.5}; for (auto mode : modes) { for (double alpha : testAlphaBeta) { for (double beta : testAlphaBeta) { + mag_t max_error = + beta * max_y + alpha * max_nnz_per_row * max_val * max_x; Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, nv, - mode); + mode, max_error); } } } @@ -535,6 +625,11 @@ void test_spmv_struct_1D(lno_t nx, lno_t leftBC, lno_t rightBC) { using scalar_view_t = typename crsMat_t::values_type::non_const_type; using x_vector_type = scalar_view_t; using y_vector_type = scalar_view_t; + using mag_t = typename Kokkos::ArithTraits::mag_type; + + constexpr mag_t max_x = static_cast(1); + constexpr mag_t max_y = static_cast(1); + constexpr mag_t max_val = static_cast(2); Kokkos::View structure("Spmv Structure", 1); structure(0) = nx; @@ -560,26 +655,31 @@ void test_spmv_struct_1D(lno_t nx, lno_t leftBC, lno_t rightBC) { Kokkos::Random_XorShift64_Pool rand_pool( 13718); - typedef typename x_vector_type::value_type ScalarX; - typedef typename y_vector_type::value_type ScalarY; + Kokkos::fill_random(input_x, rand_pool, max_x); + Kokkos::fill_random(output_y, rand_pool, max_y); - Kokkos::fill_random(input_x, rand_pool, ScalarX(1)); - Kokkos::fill_random(output_y, rand_pool, ScalarY(1)); + const mag_t max_error = max_y + 3 * max_val * max_x; - Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 1.0, 0.0); - Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 0.0, 1.0); - Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 1.0, 1.0); + Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 1.0, 0.0, + max_error); + Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 0.0, 1.0, + max_error); + Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 1.0, 1.0, + max_error); } template void test_spmv_struct_2D(lno_t nx, lno_t ny, lno_t horizontalBC, lno_t verticalBC) { - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t x_vector_type; - typedef scalar_view_t y_vector_type; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using scalar_view_t = typename crsMat_t::values_type::non_const_type; + using x_vector_type = scalar_view_t; + using y_vector_type = scalar_view_t; + using mag_t = typename Kokkos::ArithTraits::mag_type; + + constexpr mag_t max_x = static_cast(1); + constexpr mag_t max_y = static_cast(1); Kokkos::View structure("Spmv Structure", 2); structure(0) = nx; @@ -615,36 +715,44 @@ void test_spmv_struct_2D(lno_t nx, lno_t ny, lno_t horizontalBC, Kokkos::Random_XorShift64_Pool rand_pool( 13718); - typedef typename x_vector_type::value_type ScalarX; - typedef typename y_vector_type::value_type ScalarY; - - Kokkos::fill_random(input_x, rand_pool, ScalarX(1)); - Kokkos::fill_random(output_y, rand_pool, ScalarY(1)); - - Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, - 0.0); - Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 0.0, - 1.0); - Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, - 1.0); - - Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, - 0.0); - Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 0.0, - 1.0); - Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, - 1.0); + Kokkos::fill_random(input_x, rand_pool, max_x); + Kokkos::fill_random(output_y, rand_pool, max_y); + + { + constexpr mag_t max_val = static_cast(4); + constexpr mag_t max_error = max_y + 5 * max_val * max_x; + Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, + 0.0, max_error); + Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 0.0, + 1.0, max_error); + Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, + 1.0, max_error); + } + + { + constexpr mag_t max_val = static_cast(8); + constexpr mag_t max_error = max_y + 9 * max_val * max_x; + Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, + 0.0, max_error); + Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 0.0, + 1.0, max_error); + Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, + 1.0, max_error); + } } template void test_spmv_struct_3D(lno_t nx, lno_t ny, lno_t nz, lno_t horizontal1BC, lno_t horizontal2BC, lno_t verticalBC) { - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t x_vector_type; - typedef scalar_view_t y_vector_type; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using scalar_view_t = typename crsMat_t::values_type::non_const_type; + using x_vector_type = scalar_view_t; + using y_vector_type = scalar_view_t; + using mag_t = typename Kokkos::ArithTraits::mag_type; + + constexpr mag_t max_x = static_cast(1); + constexpr mag_t max_y = static_cast(1); Kokkos::View structure("Spmv Structure", 3); structure(0) = nx; @@ -688,35 +796,43 @@ void test_spmv_struct_3D(lno_t nx, lno_t ny, lno_t nz, lno_t horizontal1BC, Kokkos::Random_XorShift64_Pool rand_pool( 13718); - typedef typename x_vector_type::value_type ScalarX; - typedef typename y_vector_type::value_type ScalarY; - - Kokkos::fill_random(input_x, rand_pool, ScalarX(1)); - Kokkos::fill_random(output_y, rand_pool, ScalarY(1)); - - Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, - 0.0); - Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 0.0, - 1.0); - Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, - 1.0); - - Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, - 0.0); - Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 0.0, - 1.0); - Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, - 1.0); + Kokkos::fill_random(input_x, rand_pool, max_x); + Kokkos::fill_random(output_y, rand_pool, max_y); + + { + constexpr mag_t max_val = static_cast(6); + constexpr mag_t max_error = max_y + 7 * max_val * max_x; + Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, + 0.0, max_error); + Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 0.0, + 1.0, max_error); + Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, + 1.0, max_error); + } + + { + constexpr mag_t max_val = static_cast(26); + constexpr mag_t max_error = max_y + 27 * max_val * max_x; + Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, + 0.0, max_error); + Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 0.0, + 1.0, max_error); + Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, + 1.0, max_error); + } } template void test_spmv_mv_struct_1D(lno_t nx, int numMV) { - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef Kokkos::View x_multivector_type; - typedef Kokkos::View y_multivector_type; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using x_multivector_type = Kokkos::View; + using y_multivector_type = Kokkos::View; + using mag_t = typename Kokkos::ArithTraits::mag_type; + + constexpr mag_t max_x = static_cast(1); + constexpr mag_t max_y = static_cast(1); Kokkos::View structure("Spmv Structure", 1); structure(0) = nx; @@ -739,20 +855,19 @@ void test_spmv_mv_struct_1D(lno_t nx, int numMV) { Kokkos::Random_XorShift64_Pool rand_pool( 13718); - typedef typename x_multivector_type::value_type ScalarX; - typedef typename y_multivector_type::value_type ScalarY; + Kokkos::fill_random(input_x, rand_pool, max_x); + Kokkos::fill_random(output_y, rand_pool, max_y); - Kokkos::fill_random(input_x, rand_pool, ScalarX(10)); - Kokkos::fill_random(output_y, rand_pool, ScalarY(10)); + constexpr mag_t max_error = 5; Kokkos::deep_copy(output_y_copy, output_y); Test::check_spmv_mv_struct(input_mat, 1, structure, input_x, output_y, - output_y_copy, 1.0, 0.0, numMV); + output_y_copy, 1.0, 0.0, numMV, max_error); Test::check_spmv_mv_struct(input_mat, 1, structure, input_x, output_y, - output_y_copy, 0.0, 1.0, numMV); + output_y_copy, 0.0, 1.0, numMV, max_error); Test::check_spmv_mv_struct(input_mat, 1, structure, input_x, output_y, - output_y_copy, 1.0, 1.0, numMV); + output_y_copy, 1.0, 1.0, numMV, max_error); } // check that the controls are flowing down correctly in the spmv kernel @@ -765,6 +880,11 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, using x_vector_type = scalar_view_t; using y_vector_type = scalar_view_t; using Controls = KokkosKernels::Experimental::Controls; + using mag_t = typename Kokkos::ArithTraits::mag_type; + + constexpr mag_t max_x = static_cast(10); + constexpr mag_t max_y = static_cast(10); + constexpr mag_t max_val = static_cast(10); lno_t numCols = numRows; @@ -779,17 +899,20 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::Random_XorShift64_Pool rand_pool( 13718); - using ScalarX = typename x_vector_type::value_type; - using ScalarY = typename y_vector_type::value_type; + Kokkos::fill_random(input_x, rand_pool, max_x); + Kokkos::fill_random(output_y, rand_pool, max_y); + Kokkos::fill_random(input_mat.values, rand_pool, max_val); - Kokkos::fill_random(input_x, rand_pool, ScalarX(10)); - Kokkos::fill_random(output_y, rand_pool, ScalarY(10)); + const mag_t max_error = max_y + bandwidth * max_val * max_x; Controls controls; - Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 0.0); - Test::check_spmv_controls(controls, input_mat, input_x, output_y, 0.0, 1.0); - Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 1.0); + Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 0.0, + max_error); + Test::check_spmv_controls(controls, input_mat, input_x, output_y, 0.0, 1.0, + max_error); + Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 1.0, + max_error); } // test_spmv_controls // call it if ordinal int and, scalar float and double are instantiated. @@ -937,23 +1060,12 @@ void test_github_issue_101() { } } -#define EXECUTE_TEST_ISSUE_101(DEVICE) \ - TEST_F(TestCategory, sparse##_##spmv_issue_101##_##OFFSET##_##DEVICE) { \ - test_github_issue_101(); \ - } - template CrsMat make_block_matrix(typename CrsMat::ordinal_type &numRows, typename CrsMat::ordinal_type &numCols, typename CrsMat::ordinal_type &blockSize) { -#if 0 - typedef typename CrsMat::StaticCrsGraphType::row_map_type::non_const_type ptr_type ; - typedef typename CrsMat::StaticCrsGraphType::entries_type::non_const_type ind_type ; - typedef typename CrsMat::values_type::non_const_type val_type ; - typedef typename CrsMat::size_type size_type; -#endif - typedef typename CrsMat::ordinal_type lno_t; - typedef typename CrsMat::value_type scalar_t; + using lno_t = typename CrsMat::ordinal_type; + using scalar_t = typename CrsMat::value_type; using Kokkos::HostSpace; using Kokkos::MemoryUnmanaged; @@ -1212,22 +1324,21 @@ template &pattern, const int m, const int n, - lno_t blockSize, lno_t k, y_scalar_t alpha, y_scalar_t beta) { + lno_t blockSize, lno_t k, y_scalar_t alpha, y_scalar_t beta, + const int max_blocks_per_row) { // get the widest passed scalar type // typedef typename std::conditional= sizeof(x_scalar_t), // a_scalar_t, x_scalar_t>::type wider_t; // typedef typename std::conditional= sizeof(y_scalar_t), // wider_t, y_scalar_t>::type widest_t; - typedef typename KokkosSparse::CrsMatrix - crs_mat_t; - typedef + using crs_mat_t = typename KokkosSparse::CrsMatrix; + using bsr_mat_t = typename KokkosSparse::Experimental::BsrMatrix - bsr_mat_t; - typedef Kokkos::View x_view_t; - typedef Kokkos::View y_view_t; + void, size_type>; + using x_view_t = Kokkos::View; + using y_view_t = Kokkos::View; using DeviceRangePolicy = Kokkos::RangePolicy; @@ -1248,23 +1359,19 @@ void test_spmv_bsrmatrix_controls_pattern( y_view_t test_y("test_y", m * blockSize, k); x_view_t test_x("test_x", n * blockSize, k); + constexpr x_scalar_t max_x = 10; + constexpr y_scalar_t max_y = 10; + constexpr a_scalar_t max_a = 10; + const double max_val = + beta * max_y + alpha * max_blocks_per_row * max_a * max_x; + // fill expected with random values Kokkos::Random_XorShift64_Pool rand_pool( 13718); Kokkos::fill_random(exp_x, rand_pool, - randomUpperBound(10)); + randomUpperBound(max_x)); Kokkos::fill_random(exp_y, rand_pool, - randomUpperBound(10)); - -#if 0 - // fill inputs with 1, for help debugging - Kokkos::parallel_for("fill", - Kokkos::MDRangePolicy>({0,0}, {hi_x.extent(0), hi_x.extent(1)}), - KOKKOS_LAMBDA (unsigned i, unsigned j) { - hi_x(i,j) = 1 + (i == 0 && j == 0); - } - ); -#endif + randomUpperBound(max_y)); // copy expected operands to test operands Kokkos::deep_copy(test_x, exp_x); @@ -1292,11 +1399,11 @@ void test_spmv_bsrmatrix_controls_pattern( // uses CUDA's half type, not Kokkos, so we still need a reduced precision // test. double eps = - KOKKOSKERNELS_IMPL_FP16_EPSILON * KOKKOSKERNELS_IMPL_FP16_RADIX; + 2 * KOKKOSKERNELS_IMPL_FP16_EPSILON * KOKKOSKERNELS_IMPL_FP16_RADIX; Kokkos::parallel_reduce("KokkosSparse::Test::spmv_tc", DeviceRangePolicy(0, exp_y_i.extent(0)), Test::fSPMV( - exp_y_i, test_y_i, eps), + exp_y_i, test_y_i, eps, max_val), num_errors); // explicit cast to double since no overload for half::operator<< if (num_errors > 0) @@ -1318,13 +1425,14 @@ template void test_spmv_bsrmatrix_pattern(const std::vector &pattern, const int m, const int n, lno_t blockSize, - lno_t k, y_scalar_t alpha, y_scalar_t beta) { + lno_t k, y_scalar_t alpha, y_scalar_t beta, + const int max_blocks_per_row) { { KokkosKernels::Experimental::Controls controls; controls.setParameter("algorithm", "experimental_bsr_tc"); test_spmv_bsrmatrix_controls_pattern( - controls, pattern, m, n, blockSize, k, alpha, beta); + controls, pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } #if defined(KOKKOS_ARCH_AMPERE) @@ -1334,7 +1442,7 @@ void test_spmv_bsrmatrix_pattern(const std::vector &pattern, controls.setParameter("tc_precision", "double"); test_spmv_bsrmatrix_controls_pattern( - controls, pattern, m, n, blockSize, k, alpha, beta); + controls, pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } #endif } @@ -1352,69 +1460,76 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, { int m = 1; int n = 1; + int max_blocks_per_row = 1; std::vector pattern = {Coordinate(0, 0)}; test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } // 1x1 empty { int m = 1; int n = 1; + int max_blocks_per_row = 0; std::vector pattern = {}; test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } // 2x2 top-left { int m = 2; int n = 2; + int max_blocks_per_row = 1; std::vector pattern = {Coordinate(0, 0)}; test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } // 2x2 bottom right { int m = 2; int n = 2; + int max_blocks_per_row = 1; std::vector pattern = {Coordinate(1, 1)}; test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } // 2x3 bottom right { int m = 2; int n = 3; + int max_blocks_per_row = 1; std::vector pattern = {Coordinate(1, 2)}; test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } // 2x10 long bottom row { - int m = 2; - int n = 10; + int m = 2; + int n = 10; + int max_blocks_per_row = 10; std::vector pattern; for (int j = 0; j < n; ++j) { pattern.push_back(Coordinate(1, j)); } test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } // 10x10 column 1 + diagonal { - int m = 10; - int n = 10; + int m = 10; + int n = 10; + int max_blocks_per_row = 2; std::vector pattern; for (int i = 0; i < n; ++i) { pattern.push_back(Coordinate(i, 1)); @@ -1424,10 +1539,15 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, } test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } } +#define EXECUTE_TEST_ISSUE_101(DEVICE) \ + TEST_F(TestCategory, sparse##_##spmv_issue_101##_##OFFSET##_##DEVICE) { \ + test_github_issue_101(); \ + } + #define EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spmv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ From 4545cfbc25e1a3beb3adb303cf3ecf5baeb3f8a9 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 10 May 2022 12:32:28 -0600 Subject: [PATCH 142/261] Kokkos_ArithTraits: re-implementation using Kokkos Core This change should not affect users directly as it is only an implementation change. Using the Kokkos math functions and numeric traits, the arithmetic traits are implemented in a more portable way. Use `digits` for `t` implementation Use `finite_{min,max}` to implement `{min,max}` Applying clang-format --- src/common/Kokkos_ArithTraits.hpp | 1804 +++++++++++++---------------- 1 file changed, 832 insertions(+), 972 deletions(-) diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp index bf7235e507..672cb6cc68 100644 --- a/src/common/Kokkos_ArithTraits.hpp +++ b/src/common/Kokkos_ArithTraits.hpp @@ -49,9 +49,11 @@ /// \brief Declaration and definition of Kokkos::Details::ArithTraits #include +#include +#include #include -#include #include +#include #ifdef HAVE_KOKKOSKERNELS_QUADMATH #include @@ -349,28 +351,28 @@ class ArithTraits { /// not work well with Kokkos. In that case, we use a mostly /// equivalent type here. For example, ArithTraits /// >::val_type is Kokkos::complex. - typedef T val_type; + using val_type = T; /// \brief The type of the magnitude (absolute value) of T. /// /// We define this as the type returned by abs() in this class. If /// T is real (not complex), then \c val_type and \c mag_type are /// usually the same. If T is std::complex for some R, /// then R and \c mag_type are usually the same. - typedef T mag_type; + using mag_type = T; //! Whether ArithTraits has a specialization for T. - static const bool is_specialized = false; + static constexpr bool is_specialized = false; //! Whether T is a signed type (has negative values). - static const bool is_signed = false; + static constexpr bool is_signed = false; //! Whether T is an integer type. - static const bool is_integer = false; + static constexpr bool is_integer = false; /// \brief Whether T "uses exact representations." /// /// The opposite of is_exact is "is approximate," that is, "may /// commit rounding error." - static const bool is_exact = false; + static constexpr bool is_exact = false; //! Whether T is a complex-valued type. - static const bool is_complex = false; + static constexpr bool is_complex = false; /// \brief Whether x is Inf. /// @@ -575,21 +577,21 @@ class ArithTraits { /// class, such as log() and pow(), are not in this section. //! Same as mag_type; the type of the absolute value (magnitude) of T. - typedef T magnitudeType; + using magnitudeType = T; /// \brief The type with "half the precision" of T. /// /// This typedef only makes sense if T is a floating-point type. - typedef T halfPrecision; + using halfPrecision = T; /// \brief The type with "twice the the precision" of T. /// /// This typedef only makes sense if T is a floating-point type. - typedef T doublePrecision; + using doublePrecision = T; - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = false; + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = false; /// \brief True if this type T has floating-point parameters. /// @@ -597,7 +599,7 @@ class ArithTraits { /// has "machine-specific" parameters eps(), sfmin(), base(), /// prec(), t(), rnd(), emin(), rmin(), emax(), and rmax(), relating /// to floating-point types. - static const bool hasMachineParameters = false; + static constexpr bool hasMachineParameters = false; //! Return relative machine precision. static KOKKOS_FORCEINLINE_FUNCTION mag_type eps(); @@ -656,18 +658,19 @@ class ArithTraits { template <> class ArithTraits { public: - typedef Kokkos::Experimental::half_t val_type; - typedef val_type mag_type; + using val_type = Kokkos::Experimental::half_t; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = false; static constexpr bool has_infinity = true; static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return Kokkos::Experimental::cast_to_half(HUGE_VALF); + return Kokkos::Experimental::cast_to_half( + Kokkos::Experimental::infinity::value); } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { @@ -684,13 +687,13 @@ class ArithTraits { } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { return Kokkos::Experimental::cast_to_half( - fabs(Kokkos::Experimental::cast_from_half(x))); + Kokkos::abs(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return Kokkos::Experimental::cast_to_half(0.0F); + return Kokkos::Experimental::cast_to_half(0.0); } static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return Kokkos::Experimental::cast_to_half(1.0F); + return Kokkos::Experimental::cast_to_half(1.0); } static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return Kokkos::Experimental::cast_to_half(-KOKKOSKERNELS_IMPL_FP16_MAX); @@ -702,7 +705,7 @@ class ArithTraits { return x; } static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return Kokkos::Experimental::cast_to_half(0.0F); + return zero(); } static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return x; @@ -710,104 +713,78 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { return Kokkos::Experimental::cast_to_half( - ::pow(Kokkos::Experimental::cast_from_half(x), - Kokkos::Experimental::cast_from_half(y))); + Kokkos::pow(Kokkos::Experimental::cast_from_half(x), + Kokkos::Experimental::cast_from_half(y))); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { return Kokkos::Experimental::cast_to_half( - ::sqrt(Kokkos::Experimental::cast_from_half(x))); + Kokkos::sqrt(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(Kokkos::Experimental::cast_from_half(x)) -#else - ::cbrt(Kokkos::Experimental::cast_from_half(x)) -#endif - ); + Kokkos::cbrt(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { return Kokkos::Experimental::cast_to_half( - ::exp(Kokkos::Experimental::cast_from_half(x))); + Kokkos::exp(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { return Kokkos::Experimental::cast_to_half( - ::log(Kokkos::Experimental::cast_from_half(x))); + Kokkos::log(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { return Kokkos::Experimental::cast_to_half( - ::log10(Kokkos::Experimental::cast_from_half(x))); + Kokkos::log10(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { return Kokkos::Experimental::cast_to_half( - ::sin(Kokkos::Experimental::cast_from_half(x))); + Kokkos::sin(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { return Kokkos::Experimental::cast_to_half( - ::cos(Kokkos::Experimental::cast_from_half(x))); + Kokkos::cos(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::tan(Kokkos::Experimental::cast_from_half(x)) -#else - ::tan(Kokkos::Experimental::cast_from_half(x)) -#endif - ); + Kokkos::tan(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { return Kokkos::Experimental::cast_to_half( - ::sinh(Kokkos::Experimental::cast_from_half(x))); + Kokkos::sinh(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { return Kokkos::Experimental::cast_to_half( - ::cosh(Kokkos::Experimental::cast_from_half(x))); + Kokkos::cosh(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { return Kokkos::Experimental::cast_to_half( - ::tanh(Kokkos::Experimental::cast_from_half(x))); + Kokkos::tanh(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::asin(Kokkos::Experimental::cast_from_half(x)) -#else - ::asin(Kokkos::Experimental::cast_from_half(x)) -#endif - ); + Kokkos::asin(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::acos(Kokkos::Experimental::cast_from_half(x)) -#else - ::acos(Kokkos::Experimental::cast_from_half(x)) -#endif - ); + Kokkos::acos(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::atan(Kokkos::Experimental::cast_from_half(x)) -#else - ::atan(Kokkos::Experimental::cast_from_half(x)) -#endif - ); + Kokkos::atan(Kokkos::Experimental::cast_from_half(x))); } static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { - // return ::pow(2, -KOKKOSKERNELS_IMPL_FP16_SIGNIFICAND_BITS); return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_EPSILON); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; + using magnitudeType = mag_type; // C++ doesn't have a standard "half-float" type. - typedef val_type halfPrecision; - typedef double doublePrecision; + using halfPrecision = val_type; + using doublePrecision = double; - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } @@ -822,12 +799,8 @@ class ArithTraits { return sqrt(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { -#ifdef __CUDA_ARCH__ - return Kokkos::Experimental::cast_to_half(CUDART_NAN_F); -#else return Kokkos::Experimental::cast_to_half( - std::numeric_limits::quiet_NaN()); -#endif // __CUDA_ARCH__ + Kokkos::Experimental::quiet_NaN::value); } static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { @@ -846,9 +819,7 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION int t() { return KOKKOSKERNELS_IMPL_FP16_MANT_DIG; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { - return Kokkos::Experimental::cast_to_half(1.0); - } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return one(); } static KOKKOS_FORCEINLINE_FUNCTION int emin() { return KOKKOSKERNELS_IMPL_FP16_MIN_EXP; } @@ -870,35 +841,30 @@ class ArithTraits { template <> class ArithTraits { public: - typedef Kokkos::Experimental::bhalf_t val_type; - typedef val_type mag_type; + using val_type = Kokkos::Experimental::bhalf_t; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = false; static constexpr bool has_infinity = true; static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return Kokkos::Experimental::cast_to_bhalf(HUGE_VALF); + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::Experimental::infinity::value); } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { -#ifndef __CUDA_ARCH__ - using std::isinf; -#endif - return isinf(Kokkos::Experimental::cast_from_bhalf(x)); + return Kokkos::isinf(Kokkos::Experimental::cast_from_bhalf(x)); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { -#ifndef __CUDA_ARCH__ - using std::isnan; -#endif - return isnan(Kokkos::Experimental::cast_from_bhalf(x)); + return Kokkos::isnan(Kokkos::Experimental::cast_from_bhalf(x)); } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( - fabs(Kokkos::Experimental::cast_from_bhalf(x))); + Kokkos::abs(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return Kokkos::Experimental::cast_to_bhalf(0.0F); @@ -924,104 +890,79 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { return Kokkos::Experimental::cast_to_bhalf( - ::pow(Kokkos::Experimental::cast_from_bhalf(x), - Kokkos::Experimental::cast_from_bhalf(y))); + Kokkos::pow(Kokkos::Experimental::cast_from_bhalf(x), + Kokkos::Experimental::cast_from_bhalf(y))); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( - ::sqrt(Kokkos::Experimental::cast_from_bhalf(x))); + Kokkos::sqrt(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::cbrt(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); + Kokkos::cbrt(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( - ::exp(Kokkos::Experimental::cast_from_bhalf(x))); + Kokkos::exp(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( - ::log(Kokkos::Experimental::cast_from_bhalf(x))); + Kokkos::log(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( - ::log10(Kokkos::Experimental::cast_from_bhalf(x))); + Kokkos::log10(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( - ::sin(Kokkos::Experimental::cast_from_bhalf(x))); + Kokkos::sin(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( - ::cos(Kokkos::Experimental::cast_from_bhalf(x))); + Kokkos::cos(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::tan(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::tan(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); + Kokkos::tan(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( - ::sinh(Kokkos::Experimental::cast_from_bhalf(x))); + Kokkos::sinh(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( - ::cosh(Kokkos::Experimental::cast_from_bhalf(x))); + Kokkos::cosh(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( - ::tanh(Kokkos::Experimental::cast_from_bhalf(x))); + Kokkos::tanh(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::asin(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::asin(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); + Kokkos::asin(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::acos(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::acos(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); + Kokkos::acos(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::atan(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::atan(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); + Kokkos::atan(Kokkos::Experimental::cast_from_bhalf(x))); } static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { // return ::pow(2, -KOKKOSKERNELS_IMPL_BF16_SIGNIFICAND_BITS); return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_EPSILON); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; + using magnitudeType = mag_type; // C++ doesn't have a standard "bhalf-float" type. - typedef val_type bhalfPrecision; - typedef double doublePrecision; + using bhalfPrecision = val_type; + using doublePrecision = double; - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } @@ -1036,12 +977,8 @@ class ArithTraits { return sqrt(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { -#ifdef __CUDA_ARCH__ - return Kokkos::Experimental::cast_to_bhalf(CUDART_NAN_F); -#else return Kokkos::Experimental::cast_to_bhalf( - std::numeric_limits::quiet_NaN()); -#endif // __CUDA_ARCH__ + Kokkos::Experimental::quiet_NaN::value); } static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { @@ -1060,9 +997,7 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION int t() { return KOKKOSKERNELS_IMPL_BF16_MANT_DIG; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { - return Kokkos::Experimental::cast_to_bhalf(1.0); - } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return one(); } static KOKKOS_FORCEINLINE_FUNCTION int emin() { return KOKKOSKERNELS_IMPL_BF16_MIN_EXP; } @@ -1081,165 +1016,155 @@ class ArithTraits { template <> class ArithTraits { public: - typedef float val_type; - typedef val_type mag_type; + using val_type = float; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = false; static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION float infinity() { return HUGE_VALF; } + static KOKKOS_FORCEINLINE_FUNCTION float infinity() { + return Kokkos::Experimental::infinity::value; + } - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isinf; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isinf; -#endif - return isinf(x); + static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { + return Kokkos::isinf(x); } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isnan; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isnan; -#endif - return isnan(x); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const float x) { - return ::fabs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float zero() { return 0.0; } - static KOKKOS_FORCEINLINE_FUNCTION float one() { return 1.0; } - static KOKKOS_FORCEINLINE_FUNCTION float min() { return -FLT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION float max() { return FLT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const float x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const float) { return 0.0; } - static KOKKOS_FORCEINLINE_FUNCTION float conj(const float x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION float pow(const float x, const float y) { - return ::pow(x, y); + static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { + return Kokkos::isnan(x); } - static KOKKOS_FORCEINLINE_FUNCTION float sqrt(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif + static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { + return Kokkos::abs(x); } - static KOKKOS_FORCEINLINE_FUNCTION float cbrt(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::cbrt(x); -#else - return ::cbrt(x); -#endif + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + return static_cast(0.0); } - static KOKKOS_FORCEINLINE_FUNCTION float exp(const float x) { - return ::exp(x); + static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + return static_cast(1.0); } - static KOKKOS_FORCEINLINE_FUNCTION float log(const float x) { - return ::log(x); + static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + return Kokkos::Experimental::finite_min::value; } - static KOKKOS_FORCEINLINE_FUNCTION float log10(const float x) { - return ::log10(x); + static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + return Kokkos::Experimental::finite_max::value; } - static KOKKOS_FORCEINLINE_FUNCTION float sin(const float x) { - return ::sin(x); + static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { + return x; } - static KOKKOS_FORCEINLINE_FUNCTION float cos(const float x) { - return ::cos(x); + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + return zero(); } - static KOKKOS_FORCEINLINE_FUNCTION float tan(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::tan(x); -#else - return std::tan(x); -#endif + static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { + return x; } - static KOKKOS_FORCEINLINE_FUNCTION float sinh(const float x) { - return ::sinh(x); + static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, + const val_type y) { + return Kokkos::pow(x, y); } - static KOKKOS_FORCEINLINE_FUNCTION float cosh(const float x) { - return ::cosh(x); + static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { + return Kokkos::sqrt(x); } - static KOKKOS_FORCEINLINE_FUNCTION float tanh(const float x) { - return ::tanh(x); + static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { + return Kokkos::cbrt(x); } - static KOKKOS_FORCEINLINE_FUNCTION float asin(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::asin(x); -#else - return ::asin(x); -#endif + static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { + return Kokkos::exp(x); } - static KOKKOS_FORCEINLINE_FUNCTION float acos(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::acos(x); -#else - return ::acos(x); -#endif + static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { + return Kokkos::log(x); } - static KOKKOS_FORCEINLINE_FUNCTION float atan(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::atan(x); -#else - return ::atan(x); -#endif + static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { + return Kokkos::log10(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { + return Kokkos::sin(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { + return Kokkos::cos(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { + return Kokkos::tan(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { + return Kokkos::sinh(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { + return Kokkos::cosh(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { + return Kokkos::tanh(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { + return Kokkos::asin(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { + return Kokkos::acos(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { + return Kokkos::atan(x); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { + return Kokkos::Experimental::epsilon::value; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return FLT_EPSILON; } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; + using magnitudeType = mag_type; // C++ doesn't have a standard "half-float" type. - typedef float halfPrecision; - typedef double doublePrecision; + using halfPrecision = float; + using doublePrecision = double; - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const float x) { + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; + static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const float x) { + static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } - static KOKKOS_FORCEINLINE_FUNCTION float conjugate(const float x) { + static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { return conj(x); } static std::string name() { return "float"; } - static KOKKOS_FORCEINLINE_FUNCTION float squareroot(const float x) { + static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { return sqrt(x); } - static KOKKOS_FORCEINLINE_FUNCTION float nan() { -#if defined(__CUDA_ARCH__) - return CUDART_NAN_F; - // return nan (); //this returns 0??? -#elif defined(__HIP_DEVICE_COMPILE__) - return ::nanf(""); -#else - return std::numeric_limits::quiet_NaN(); -#endif // __CUDA_ARCH__ + static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { + return Kokkos::Experimental::quiet_NaN::value; } static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return FLT_MIN; // ??? + return Kokkos::Experimental::norm_min::value; // ??? + } + static KOKKOS_FORCEINLINE_FUNCTION int base() { + return Kokkos::Experimental::radix::value; } - static KOKKOS_FORCEINLINE_FUNCTION int base() { return FLT_RADIX; } static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { return eps() * static_cast(base()); } - static KOKKOS_FORCEINLINE_FUNCTION int t() { return FLT_MANT_DIG; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return 1.0; } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { return FLT_MIN_EXP; } + static KOKKOS_FORCEINLINE_FUNCTION int t() { + return Kokkos::Experimental::digits::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { + return Kokkos::reduction_identity::prod(); + } + static KOKKOS_FORCEINLINE_FUNCTION int emin() { + return Kokkos::Experimental::min_exponent::value; + } static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return FLT_MIN; // ??? // should be base^(emin-1) + return Kokkos::Experimental::norm_min::value; // ??? // should be + // base^(emin-1) + } + static KOKKOS_FORCEINLINE_FUNCTION int emax() { + return Kokkos::Experimental::max_exponent::value; } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { return FLT_MAX_EXP; } static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return FLT_MAX; // ??? // should be (base^emax)*(1-eps) + return Kokkos::Experimental::finite_max< + val_type>::value; // ??? // should be (base^emax)*(1-eps) } }; @@ -1252,14 +1177,14 @@ template class ArithTraits > { public: //! Kokkos internally replaces std::complex with Kokkos::complex. - typedef ::Kokkos::complex val_type; - typedef RealFloatType mag_type; + using val_type = ::Kokkos::complex; + using mag_type = RealFloatType; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = true; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = true; static constexpr bool has_infinity = true; static std::complex infinity() { @@ -1444,16 +1369,16 @@ class ArithTraits > { static mag_type epsilon() { return ArithTraits::epsilon(); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef std::complex::halfPrecision> - halfPrecision; - typedef std::complex::doublePrecision> - doublePrecision; - - static const bool isComplex = true; - static const bool isOrdinal = false; - static const bool isComparable = false; - static const bool hasMachineParameters = true; + using magnitudeType = mag_type; + using halfPrecision = + std::complex::halfPrecision>; + using doublePrecision = + std::complex::doublePrecision>; + + static constexpr bool isComplex = true; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = false; + static constexpr bool hasMachineParameters = true; static bool isnaninf(const std::complex& x) { return isNan(x) || isInf(x); } @@ -1486,148 +1411,119 @@ class ArithTraits > { template <> class ArithTraits { public: - typedef double val_type; - typedef val_type mag_type; + using val_type = double; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = false; static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION double infinity() { return HUGE_VAL; } + static KOKKOS_FORCEINLINE_FUNCTION double infinity() { + return Kokkos::Experimental::infinity::value; + } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isinf; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isinf; -#endif - return isinf(x); + return Kokkos::isinf(x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isnan; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isnan; -#endif - return isnan(x); + return Kokkos::isnan(x); } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return ::fabs(x); + return Kokkos::abs(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + return static_cast(0.0); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + return static_cast(1.0); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + return Kokkos::Experimental::finite_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + return Kokkos::Experimental::finite_max::value; } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0.0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1.0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return -DBL_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return DBL_MAX; } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return 0.0; + return zero(); } static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { - return ::pow(x, y); + return Kokkos::pow(x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif + return Kokkos::sqrt(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::cbrt(x); -#else - return ::cbrt(x); -#endif + return Kokkos::cbrt(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return ::exp(x); + return Kokkos::exp(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return ::log(x); + return Kokkos::log(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return ::log10(x); + return Kokkos::log10(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { - return ::sin(x); + return Kokkos::sin(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { - return ::cos(x); + return Kokkos::cos(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::tan(x); -#else - return std::tan(x); -#endif + return Kokkos::tan(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { - return ::sinh(x); + return Kokkos::sinh(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { - return ::cosh(x); + return Kokkos::cosh(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { - return ::tanh(x); + return Kokkos::tanh(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::asin(x); -#else - return ::asin(x); -#endif + return Kokkos::asin(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::acos(x); -#else - return ::acos(x); -#endif + return Kokkos::acos(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::atan(x); -#else - return ::atan(x); -#endif + return Kokkos::atan(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { + return Kokkos::Experimental::quiet_NaN::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { + return Kokkos::Experimental::epsilon::value; + } + + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + using halfPrecision = float; #if defined(__CUDA_ARCH__) - return CUDART_NAN; - // return nan (); // this returns 0 ??? + using doublePrecision = + double; // CUDA doesn't support long double, unfortunately #elif defined(__HIP_DEVICE_COMPILE__) - return ::nan(""); + using doublePrecision = + double; // HIP does not support long double unfortunately #else - return std::numeric_limits::quiet_NaN(); + using doublePrecision = long double; #endif // __CUDA_ARCH__ - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return DBL_EPSILON; } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef float halfPrecision; -#if defined(__CUDA_ARCH__) - typedef double - doublePrecision; // CUDA doesn't support long double, unfortunately -#elif defined(__HIP_DEVICE_COMPILE__) - typedef double - doublePrecision; // HIP does not support long double unfortunately -#else - typedef long double doublePrecision; -#endif // __CUDA_ARCH__ - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { return abs(x); @@ -1641,23 +1537,32 @@ class ArithTraits { } static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return DBL_MIN; // ??? + return Kokkos::Experimental::norm_min::value; // ??? } static KOKKOS_FORCEINLINE_FUNCTION int base() { - return FLT_RADIX; // same for float as for double + return Kokkos::Experimental::radix::value; // same for float as + // for double } static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { return eps() * static_cast(base()); } - static KOKKOS_FORCEINLINE_FUNCTION int t() { return DBL_MANT_DIG; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return 1.0; } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { return DBL_MIN_EXP; } + static KOKKOS_FORCEINLINE_FUNCTION int t() { + return Kokkos::Experimental::digits::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return one(); } + static KOKKOS_FORCEINLINE_FUNCTION int emin() { + return Kokkos::Experimental::min_exponent::value; + } static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return DBL_MIN; // ??? // should be base^(emin-1) + return Kokkos::Experimental::norm_min::value; // ??? // should be + // base^(emin-1) + } + static KOKKOS_FORCEINLINE_FUNCTION int emax() { + return Kokkos::Experimental::max_exponent::value; } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { return DBL_MAX_EXP; } static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return DBL_MAX; // ??? // should be (base^emax)*(1-eps) + return Kokkos::Experimental::finite_max< + val_type>::value; // ??? // should be (base^emax)*(1-eps) } }; @@ -1667,65 +1572,67 @@ class ArithTraits { template <> class ArithTraits { public: - typedef long double val_type; - typedef long double mag_type; + using val_type = long double; + using mag_type = long double; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = false; static constexpr bool has_infinity = true; static long double infinity() { return HUGE_VALL; } - static bool isInf(const val_type& x) { - using std::isinf; - return isinf(x); + static bool isInf(const val_type& x) { return Kokkos::isinf(x); } + static bool isNan(const val_type& x) { return Kokkos::isnan(x); } + static mag_type abs(const val_type& x) { return Kokkos::abs(x); } + static val_type zero() { return static_cast(0.0); } + static val_type one() { return static_cast(1.0); } + static val_type min() { + return Kokkos::Experimental::finite_min::value; } - static bool isNan(const val_type& x) { - using std::isnan; - return isnan(x); + static val_type max() { + return Kokkos::Experimental::finite_max::value; } - static mag_type abs(const val_type& x) { return ::fabsl(x); } - static val_type zero() { return 0.0; } - static val_type one() { return 1.0; } - static val_type min() { return -LDBL_MAX; } - static val_type max() { return LDBL_MAX; } static mag_type real(const val_type& x) { return x; } static mag_type imag(const val_type&) { return zero(); } static val_type conj(const val_type& x) { return x; } static val_type pow(const val_type& x, const val_type& y) { - return ::pow(x, y); + return Kokkos::pow(x, y); + } + static val_type sqrt(const val_type& x) { return Kokkos::sqrt(x); } + static val_type cbrt(const val_type& x) { return Kokkos::cbrtl(x); } + static val_type exp(const val_type& x) { return Kokkos::exp(x); } + static val_type log(const val_type& x) { return Kokkos::log(x); } + static val_type log10(const val_type& x) { return Kokkos::log10(x); } + static val_type sin(const val_type& x) { return Kokkos::sin(x); } + static val_type cos(const val_type& x) { return Kokkos::cos(x); } + static val_type tan(const val_type& x) { return Kokkos::tan(x); } + static val_type sinh(const val_type& x) { return Kokkos::sinh(x); } + static val_type cosh(const val_type& x) { return Kokkos::cosh(x); } + static val_type tanh(const val_type& x) { return Kokkos::tanh(x); } + static val_type asin(const val_type& x) { return Kokkos::asin(x); } + static val_type acos(const val_type& x) { return Kokkos::acos(x); } + static val_type atan(const val_type& x) { return Kokkos::atan(x); } + static val_type nan() { + return Kokkos::Experimental::quiet_NaN::value; + } + static mag_type epsilon() { + return Kokkos::Experimental::epsilon::value; } - static val_type sqrt(const val_type& x) { return ::sqrt(x); } - static val_type cbrt(const val_type& x) { return ::cbrtl(x); } - static val_type exp(const val_type& x) { return ::exp(x); } - static val_type log(const val_type& x) { return ::log(x); } - static val_type log10(const val_type& x) { return ::log10(x); } - static val_type sin(const val_type& x) { return ::sin(x); } - static val_type cos(const val_type& x) { return ::cos(x); } - static val_type tan(const val_type& x) { return ::tan(x); } - static val_type sinh(const val_type& x) { return ::sinh(x); } - static val_type cosh(const val_type& x) { return ::cosh(x); } - static val_type tanh(const val_type& x) { return ::tanh(x); } - static val_type asin(const val_type& x) { return ::asin(x); } - static val_type acos(const val_type& x) { return ::acos(x); } - static val_type atan(const val_type& x) { return ::atan(x); } - static val_type nan() { return std::numeric_limits::quiet_NaN(); } - static mag_type epsilon() { return LDBL_EPSILON; } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef double halfPrecision; + using magnitudeType = mag_type; + using halfPrecision = double; // It might be appropriate to use QD's qd_real here. // For now, long double is the most you get. - typedef val_type doublePrecision; + using doublePrecision = val_type; - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } static mag_type magnitude(const val_type& x) { return abs(x); } static val_type conjugate(const val_type& x) { return conj(x); } @@ -1733,18 +1640,24 @@ class ArithTraits { static val_type squareroot(const val_type& x) { return sqrt(x); } static mag_type eps() { return epsilon(); } static mag_type sfmin() { - return LDBL_MIN; // ??? - } - static int base() { - return FLT_RADIX; // same for float as for double or long double + return Kokkos::Experimental::norm_min::value; // ??? } + static int base() { return Kokkos::Experimental::radix::value; } static mag_type prec() { return eps() * static_cast(base()); } - static int t() { return LDBL_MANT_DIG; } + static int t() { return Kokkos::Experimental::digits::value; } static mag_type rnd() { return one(); } - static int emin() { return LDBL_MIN_EXP; } - static mag_type rmin() { return LDBL_MIN; } - static int emax() { return LDBL_MAX_EXP; } - static mag_type rmax() { return LDBL_MAX; } + static int emin() { + return Kokkos::Experimental::min_exponent::value; + } + static mag_type rmin() { + return Kokkos::Experimental::norm_min::value; + } + static int emax() { + return Kokkos::Experimental::max_exponent::value; + } + static mag_type rmax() { + return Kokkos::Experimental::finite_max::value; + } }; // long double specialization #ifdef HAVE_KOKKOSKERNELS_QUADMATH @@ -1755,14 +1668,14 @@ class ArithTraits { template <> class ArithTraits<__float128> { public: - typedef __float128 val_type; - typedef val_type mag_type; + using val_type = __float128; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = false; static constexpr bool has_infinity = true; static __float128 infinity() { return 1.0q / 0.0q; } @@ -1797,15 +1710,15 @@ class ArithTraits<__float128> { static mag_type epsilon() { return FLT128_EPSILON; } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef double halfPrecision; + using magnitudeType = mag_type; + using halfPrecision = double; // Unfortunately, we can't rely on a standard __float256 type. - typedef __float128 doublePrecision; + using doublePrecision = __float128; - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; static bool isnaninf(const __float128 x) { return isNan(x) || isInf(x); } static magnitudeType magnitude(const __float128 x) { return abs(x); } static __float128 conjugate(const __float128 x) { return conj(x); } @@ -1836,14 +1749,14 @@ class ArithTraits<__float128> { template <> class ArithTraits< ::Kokkos::complex > { public: - typedef ::Kokkos::complex val_type; - typedef float mag_type; + using val_type = ::Kokkos::complex; + using mag_type = float; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = true; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = true; static constexpr bool has_infinity = true; static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { @@ -1860,8 +1773,7 @@ class ArithTraits< ::Kokkos::complex > { ArithTraits::isNan(x.imag()); } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return std::sqrt(::Kokkos::real(x) * ::Kokkos::real(x) + - ::Kokkos::imag(x) * ::Kokkos::imag(x)); + return Kokkos::abs(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return val_type(ArithTraits::zero(), @@ -1888,6 +1800,8 @@ class ArithTraits< ::Kokkos::complex > { static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return ::Kokkos::conj(x); } + // Note lbv 05-18-2022: we could just use the function defined in + // Kokkos_Complex.hpp and enable this feature // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const // val_type y) { // const mag_type abs_x_square = x.real()*x.real() + x.imag()*x.imag(); @@ -1998,15 +1912,15 @@ class ArithTraits< ::Kokkos::complex > { } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef ::Kokkos::complex::halfPrecision> halfPrecision; - typedef ::Kokkos::complex::doublePrecision> - doublePrecision; - - static const bool isComplex = true; - static const bool isOrdinal = false; - static const bool isComparable = false; - static const bool hasMachineParameters = + using magnitudeType = mag_type; + using halfPrecision = ::Kokkos::complex::halfPrecision>; + using doublePrecision = + ::Kokkos::complex::doublePrecision>; + + static constexpr bool isComplex = true; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = false; + static constexpr bool hasMachineParameters = ArithTraits::hasMachineParameters; static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { @@ -2052,14 +1966,14 @@ class ArithTraits< ::Kokkos::complex > { template <> class ArithTraits< ::Kokkos::complex > { public: - typedef ::Kokkos::complex val_type; - typedef double mag_type; + using val_type = ::Kokkos::complex; + using mag_type = double; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = true; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = true; static constexpr bool has_infinity = true; static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { @@ -2214,15 +2128,15 @@ class ArithTraits< ::Kokkos::complex > { } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef ::Kokkos::complex::halfPrecision> halfPrecision; - typedef ::Kokkos::complex::doublePrecision> - doublePrecision; - - static const bool isComplex = true; - static const bool isOrdinal = false; - static const bool isComparable = false; - static const bool hasMachineParameters = + using magnitudeType = mag_type; + using halfPrecision = ::Kokkos::complex::halfPrecision>; + using doublePrecision = + ::Kokkos::complex::doublePrecision>; + + static constexpr bool isComplex = true; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = false; + static constexpr bool hasMachineParameters = ArithTraits::hasMachineParameters; static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { @@ -2268,21 +2182,23 @@ class ArithTraits< ::Kokkos::complex > { template <> class ArithTraits { public: - typedef char val_type; - typedef val_type mag_type; + using val_type = char; + using mag_type = val_type; - static const bool is_specialized = true; + static constexpr bool is_specialized = true; // The C(++) standard does not require that char be signed. In // fact, signed char, unsigned char, and char are distinct types. // We can use std::numeric_limits here because it's a const bool, // not a class method. - static const bool is_signed = std::numeric_limits::is_signed; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; + static constexpr bool is_signed = std::numeric_limits::is_signed; + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + return static_cast(0); + } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { return false; @@ -2292,26 +2208,32 @@ class ArithTraits { } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { // This avoids warnings based on whether char is signed or unsigned - return integer_abs::abs(x); + return Kokkos::abs(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + return static_cast(0); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + return static_cast(1); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + return Kokkos::Experimental::finite_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + return Kokkos::Experimental::finite_max::value; } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return CHAR_MIN; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return CHAR_MAX; } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + return zero(); + } static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { - if (is_signed) { - return intPowSigned(x, y); - } else { - return intPowUnsigned(x, y); - } + return Kokkos::pow(x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { // C++11 defines std::sqrt for integer arguments. However, we @@ -2332,31 +2254,19 @@ class ArithTraits { // some reasonable value (like 0), though this might be more // expensive than the absolute value interpreted using the ternary // operator. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::sqrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::cbrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); + return static_cast(Kokkos::exp(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); + return static_cast(Kokkos::log(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); + return static_cast(Kokkos::log10(abs(x))); } // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { // return static_cast ( ::sin (static_cast (x))); @@ -2388,14 +2298,14 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; + using magnitudeType = mag_type; + using halfPrecision = val_type; + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = true; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = false; static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } @@ -2414,17 +2324,19 @@ class ArithTraits { template <> class ArithTraits { public: - typedef signed char val_type; - typedef val_type mag_type; + using val_type = signed char; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + return static_cast(0); + } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { return false; @@ -2433,49 +2345,47 @@ class ArithTraits { return false; } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x >= 0 ? x : -x; + return Kokkos::abs(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + return static_cast(0); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + return static_cast(1); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + return Kokkos::Experimental::finite_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + return Kokkos::Experimental::finite_max::value; } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return SCHAR_MIN; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return SCHAR_MAX; } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + return zero(); + } static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { - return intPowSigned(x, y); + return Kokkos::pow(x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::sqrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::cbrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); + return static_cast(Kokkos::exp(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); + return static_cast(Kokkos::log(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); + return static_cast(Kokkos::log10(abs(x))); } // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { // return static_cast ( ::sin (static_cast (x))); @@ -2507,14 +2417,14 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; + using magnitudeType = mag_type; + using halfPrecision = val_type; + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = true; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = false; static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } @@ -2533,17 +2443,19 @@ class ArithTraits { template <> class ArithTraits { public: - typedef unsigned char val_type; - typedef val_type mag_type; + using val_type = unsigned char; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = false; + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + return static_cast(0); + } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { return false; @@ -2554,50 +2466,45 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { return x; // it's unsigned, so it's positive } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return UCHAR_MAX; } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + return static_cast(0); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + return static_cast(1); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + return Kokkos::Experimental::finite_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + return Kokkos::Experimental::finite_max::value; + } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + return zero(); + } static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { - return intPowUnsigned(x, y); + return Kokkos::pow(x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::sqrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::cbrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); + return static_cast(Kokkos::exp(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); + return static_cast(Kokkos::log(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); + return static_cast(Kokkos::log10(x)); } // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { // return static_cast ( ::sin (static_cast (x))); @@ -2629,14 +2536,14 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; + using magnitudeType = mag_type; + using halfPrecision = val_type; + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = true; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = false; static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } @@ -2655,17 +2562,19 @@ class ArithTraits { template <> class ArithTraits { public: - typedef short val_type; - typedef val_type mag_type; + using val_type = short; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + return static_cast(0); + } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { return false; @@ -2674,63 +2583,48 @@ class ArithTraits { return false; } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - // std::abs appears to work with CUDA 5.5 at least, but I'll use - // the ternary expression for maximum generality. Note that this - // expression does not necessarily obey the rules for fabs() with - // NaN input, so it should not be used for floating-point types. - // It's perfectly fine for signed integer types, though. - return x >= 0 ? x : -x; + return Kokkos::abs(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + return static_cast(0); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + return static_cast(1); } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - // Macros like this work with CUDA, but - // std::numeric_limits::min() does not, because it is - // not marked as a __device__ function. - return SHRT_MIN; + return Kokkos::Experimental::finite_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + return Kokkos::Experimental::finite_max::value; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return SHRT_MAX; } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + return zero(); + } static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { - return intPowSigned(x, y); + return Kokkos::pow(x, y); } //! Integer square root returns a lower bound. static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::sqrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::cbrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); + return static_cast(Kokkos::exp(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); + return static_cast(Kokkos::log(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); + return static_cast(Kokkos::log10(abs(x))); } // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { // return static_cast ( ::sin (static_cast (x))); @@ -2763,19 +2657,19 @@ class ArithTraits { // short doesn't implement a NaN value, but we can still have it // return some "flag" value that can help users find use of // uninitialized data. - return static_cast(-1); + return -one(); } static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; + using magnitudeType = mag_type; + using halfPrecision = val_type; + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = true; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = false; static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } @@ -2794,17 +2688,19 @@ class ArithTraits { template <> class ArithTraits { public: - typedef unsigned short val_type; - typedef val_type mag_type; + using val_type = unsigned short; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = false; + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + return static_cast(0); + } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { return false; @@ -2815,50 +2711,48 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { return x; // it's unsigned, so it's positive } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return USHRT_MAX; } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + return static_cast(0); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + return static_cast(1); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + return Kokkos::Experimental::finite_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + return Kokkos::Experimental::finite_max::value; + } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + return zero(); + } static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { - return intPowUnsigned(x, y); + return Kokkos::pow(x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { // This will result in no loss of accuracy, though it might be // more expensive than it should, if we were clever about using // bit operations. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::sqrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::cbrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); + return static_cast(Kokkos::exp(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); + return static_cast(Kokkos::log(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); + return static_cast(Kokkos::log10(x)); } // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { // return static_cast ( ::sin (static_cast (x))); @@ -2896,14 +2790,14 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; + using magnitudeType = mag_type; + using halfPrecision = val_type; + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = true; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = false; static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } @@ -2922,17 +2816,19 @@ class ArithTraits { template <> class ArithTraits { public: - typedef int val_type; - typedef val_type mag_type; + using val_type = int; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + return static_cast(0); + } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { return false; @@ -2941,62 +2837,47 @@ class ArithTraits { return false; } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - // std::abs appears to work with CUDA 5.5 at least, but I'll use - // the ternary expression for maximum generality. Note that this - // expression does not necessarily obey the rules for fabs() with - // NaN input, so it should not be used for floating-point types. - // It's perfectly fine for signed integer types, though. - return x >= 0 ? x : -x; + return Kokkos::abs(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + return static_cast(0); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + return static_cast(1); } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - // Macros like INT_MIN work with CUDA, but - // std::numeric_limits::min() does not, because it is - // not marked as a __device__ function. - return INT_MIN; + return Kokkos::Experimental::finite_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + return Kokkos::Experimental::finite_max::value; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return INT_MAX; } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + return zero(); + } static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { - return intPowSigned(x, y); + return Kokkos::pow(x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::sqrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::cbrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); + return static_cast(Kokkos::exp(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); + return static_cast(Kokkos::log(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); + return static_cast(Kokkos::log10(abs(x))); } // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { // return static_cast ( ::sin (static_cast (x))); @@ -3029,19 +2910,19 @@ class ArithTraits { // int doesn't implement a NaN value, but we can still have it // return some "flag" value that can help users find use of // uninitialized data. - return -1; + return -one(); } static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; + using magnitudeType = mag_type; + using halfPrecision = val_type; + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = true; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = false; static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } @@ -3060,17 +2941,19 @@ class ArithTraits { template <> class ArithTraits { public: - typedef unsigned int val_type; - typedef val_type mag_type; + using val_type = unsigned int; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = false; + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + return static_cast(0); + } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { return false; @@ -3081,50 +2964,45 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { return x; // it's unsigned, so it's positive } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return UINT_MAX; } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + return static_cast(0); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + return static_cast(1); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + return Kokkos::Experimental::finite_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + return Kokkos::Experimental::finite_max::value; + } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + return zero(); + } static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { - return intPowUnsigned(x, y); + return Kokkos::pow(x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::sqrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); + return static_cast(Kokkos::cbrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); + return static_cast(Kokkos::exp(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); + return static_cast(Kokkos::log(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); + return static_cast(Kokkos::log10(x)); } // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { // return static_cast ( ::sin (static_cast (x))); @@ -3162,14 +3040,14 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; + using magnitudeType = mag_type; + using halfPrecision = val_type; + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = true; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = false; static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } @@ -3188,17 +3066,19 @@ class ArithTraits { template <> class ArithTraits { public: - typedef long val_type; - typedef val_type mag_type; + using val_type = long; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + return static_cast(0); + } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { return false; @@ -3209,35 +3089,39 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { return x >= 0 ? x : -x; } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return LONG_MIN; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return LONG_MAX; } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + return static_cast(0); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + return static_cast(1); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + return Kokkos::Experimental::finite_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + return Kokkos::Experimental::finite_max::value; + } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + return zero(); + } static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { - return intPowSigned(x, y); + return Kokkos::pow(x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - using std::abs; - using std::sqrt; -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - return static_cast(sqrt(static_cast(abs(x)))); -#else - return static_cast(sqrt(static_cast(abs(x)))); -#endif + return static_cast(Kokkos::sqrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); + return static_cast(Kokkos::log(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); + return static_cast(Kokkos::log10(abs(x))); } // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { // return static_cast ( ::sin (static_cast (x))); @@ -3270,19 +3154,19 @@ class ArithTraits { // long doesn't implement a NaN value, but we can still have it // return some "flag" value that can help users find use of // uninitialized data. - return -1; + return -one(); } static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; + using magnitudeType = mag_type; + using halfPrecision = val_type; + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = true; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = false; static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } @@ -3301,17 +3185,19 @@ class ArithTraits { template <> class ArithTraits { public: - typedef unsigned long val_type; - typedef val_type mag_type; + using val_type = unsigned long; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = false; + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + return static_cast(0); + } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { return false; @@ -3322,51 +3208,45 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return ULONG_MAX; } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + return static_cast(0); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + return static_cast(1); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + return Kokkos::Experimental::finite_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + return Kokkos::Experimental::finite_max::value; + } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + return zero(); + } static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { - return intPowUnsigned(x, y); + return Kokkos::pow(x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - using std::sqrt; -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - return static_cast(sqrt(static_cast(x))); -#else - return static_cast(sqrt(static_cast(x))); -#endif + return static_cast(Kokkos::sqrt(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::cbrtl; - return static_cast(::cbrtl(static_cast(x))); -#else - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); -#endif + return static_cast(Kokkos::cbrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); + return static_cast(Kokkos::exp(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); + return static_cast(Kokkos::log(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); + return static_cast(Kokkos::log10(x)); } // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { // return static_cast ( ::sin (static_cast (x))); @@ -3404,14 +3284,14 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; + using magnitudeType = mag_type; + using halfPrecision = val_type; + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = true; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = false; static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } @@ -3430,17 +3310,19 @@ class ArithTraits { template <> class ArithTraits { public: - typedef long long val_type; - typedef val_type mag_type; + using val_type = long long; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + return static_cast(0); + } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { return false; @@ -3449,67 +3331,47 @@ class ArithTraits { return false; } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x >= 0 ? x : -x; + return Kokkos::abs(x); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + return static_cast(0); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + return static_cast(1); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + return Kokkos::Experimental::finite_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + return Kokkos::Experimental::finite_max::value; } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return LLONG_MIN; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return LLONG_MAX; } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + return zero(); + } static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { - return intPowSigned(x, y); + return Kokkos::pow(x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::abs; - using std::sqrt; - // IEEE 754 promises that long double has at least 64 significand - // bits, so we can use it to represent any signed or unsigned - // 64-bit integer type exactly. However, CUDA does not implement - // long double for device functions. - return static_cast(sqrt(static_cast(abs(x)))); -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - // Casting from a 64-bit integer type to double does result in a - // loss of accuracy. However, it gives us a good first - // approximation. For very large numbers, we may lose some - // significand bits, but will always get within a factor of two - // (assuming correct rounding) of the exact double-precision - // number. We could then binary search between half the result - // and twice the result (assuming the latter is <= INT64_MAX, - // which it has to be, so we don't have to check) to ensure - // correctness. It actually should suffice to check numbers - // within 1 of the result. - return static_cast(sycl::sqrt(static_cast(abs(x)))); -#else - return static_cast(::sqrt(static_cast(abs(x)))); -#endif + return static_cast(Kokkos::sqrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::abs; - using std::cbrtl; - return static_cast(cbrtl(static_cast(abs(x)))); -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - return static_cast(sycl::cbrt(static_cast(abs(x)))); -#else - return static_cast(::cbrt(static_cast(abs(x)))); -#endif + return static_cast(Kokkos::cbrt(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); + return static_cast(Kokkos::exp(static_cast(abs(x)))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); + return static_cast(Kokkos::log(abs(x))); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); + return static_cast(Kokkos::log10(abs(x))); } // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { // return static_cast ( ::sin (static_cast (x))); @@ -3542,19 +3404,19 @@ class ArithTraits { // long long doesn't implement a NaN value, but we can still have // it return some "flag" value that can help users find use of // uninitialized data. - return -1; + return -one(); } static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; + using magnitudeType = mag_type; + using halfPrecision = val_type; + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = true; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = false; static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } @@ -3573,17 +3435,19 @@ class ArithTraits { template <> class ArithTraits { public: - typedef unsigned long long val_type; - typedef val_type mag_type; + using val_type = unsigned long long; + using mag_type = val_type; - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; + static constexpr bool is_specialized = true; + static constexpr bool is_signed = false; + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + return static_cast(0); + } static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { return false; @@ -3592,51 +3456,47 @@ class ArithTraits { return false; } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; // unsigned integers are always nonnegative + return x; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + return static_cast(0); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + return static_cast(1); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + return Kokkos::Experimental::finite_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + return Kokkos::Experimental::finite_max::value; } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return ULLONG_MAX; } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } + static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + return zero(); + } static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { return x; } static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, const val_type y) { - return intPowUnsigned(x, y); + return Kokkos::pow(x, y); } static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::sqrt; - return static_cast(sqrt(static_cast(x))); -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - return static_cast(sycl::sqrt(static_cast(x))); -#else - return static_cast(::sqrt(static_cast(x))); -#endif + return static_cast(Kokkos::sqrt(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::cbrtl; - return static_cast(cbrtl(static_cast(x))); -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - return static_cast(sycl::cbrt(static_cast(x))); -#else - return static_cast(::cbrt(static_cast(x))); -#endif + return static_cast(Kokkos::cbrt(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); + return static_cast(Kokkos::exp(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); + return static_cast(Kokkos::log(x)); } static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); + return static_cast(Kokkos::log10(x)); } // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { // return static_cast ( ::sin (static_cast (x))); @@ -3674,14 +3534,14 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; + using magnitudeType = mag_type; + using halfPrecision = val_type; + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = true; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = false; static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } From 445ab15d3fcdef24ec11c3147b85e669f1fe12a4 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 1 Apr 2022 07:38:05 -0600 Subject: [PATCH 143/261] cm_test_all_sandia: Set OMP_NUM_THREADS to 47 for armpl --- scripts/cm_test_all_sandia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 1f8ee5ed51..d0fefe8f28 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -566,7 +566,7 @@ elif [ "$MACHINE" = "inouye" ]; then SKIP_HWLOC=True export OMP_PROC_BIND=close export OMP_PLACES=cores - export OMP_NUM_THREADS=48 + export OMP_NUM_THREADS=47 BASE_MODULE_LIST="cmake/3.17.0,/" From df7a71109430e51cfd4273c608b565ce574d74e5 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 4 Apr 2022 13:51:23 -0600 Subject: [PATCH 144/261] scripts/cm_test_all_sandia: Fix bug in OMP settings --- scripts/cm_test_all_sandia | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index d0fefe8f28..16ef7dc9dc 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -564,9 +564,9 @@ elif [ "$MACHINE" = "inouye" ]; then MODULE_ENVIRONMENT="module purge" eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True - export OMP_PROC_BIND=close - export OMP_PLACES=cores - export OMP_NUM_THREADS=47 + export omp_proc_bind=close + export omp_places=cores + export omp_num_threads=47 BASE_MODULE_LIST="cmake/3.17.0,/" @@ -916,9 +916,9 @@ else exit 1 fi -export OMP_NUM_THREADS=8 -export OMP_PROC_BIND=spread -export OMP_PLACES=cores +export OMP_NUM_THREADS=${omp_num_threads:=8} +export OMP_PROC_BIND=${omp_proc_bind:=spread} +export OMP_PLACES=${omp_places:=cores} declare -i NUM_RESULTS_TO_KEEP=7 From a80a38da90fa1c616cc9a85204cd074b39ec2622 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 19 May 2022 14:59:39 -0600 Subject: [PATCH 145/261] ArithTraits: removing some unnecessary comments --- src/common/Kokkos_ArithTraits.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp index 672cb6cc68..e91252db6b 100644 --- a/src/common/Kokkos_ArithTraits.hpp +++ b/src/common/Kokkos_ArithTraits.hpp @@ -1138,7 +1138,7 @@ class ArithTraits { } static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return Kokkos::Experimental::norm_min::value; // ??? + return Kokkos::Experimental::norm_min::value; } static KOKKOS_FORCEINLINE_FUNCTION int base() { return Kokkos::Experimental::radix::value; @@ -1156,15 +1156,14 @@ class ArithTraits { return Kokkos::Experimental::min_exponent::value; } static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return Kokkos::Experimental::norm_min::value; // ??? // should be - // base^(emin-1) + return Kokkos::Experimental::norm_min::value; } static KOKKOS_FORCEINLINE_FUNCTION int emax() { return Kokkos::Experimental::max_exponent::value; } static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { return Kokkos::Experimental::finite_max< - val_type>::value; // ??? // should be (base^emax)*(1-eps) + val_type>::value; } }; From b4107b2db8136fb11dec5ca6bf91db8f7f28881f Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 19 May 2022 18:06:59 -0600 Subject: [PATCH 146/261] ArithTraits: reorganizing the traits functions --- src/common/Kokkos_ArithTraits.hpp | 545 +++++++++++++++--------------- 1 file changed, 279 insertions(+), 266 deletions(-) diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp index e91252db6b..22c62a7fe8 100644 --- a/src/common/Kokkos_ArithTraits.hpp +++ b/src/common/Kokkos_ArithTraits.hpp @@ -1024,21 +1024,20 @@ class ArithTraits { static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr bool is_complex = false; + static constexpr bool has_infinity = true; - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION float infinity() { - return Kokkos::Experimental::infinity::value; - } + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + using halfPrecision = float; // Should we switch to Kokkos::half_t + using doublePrecision = double; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; + + static std::string name() { return "float"; } - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { - return Kokkos::isinf(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { - return Kokkos::isnan(x); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return Kokkos::abs(x); - } static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return static_cast(0.0); } @@ -1051,6 +1050,52 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return Kokkos::Experimental::finite_max::value; } + static KOKKOS_FORCEINLINE_FUNCTION float infinity() { + return Kokkos::Experimental::infinity::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { + return Kokkos::Experimental::quiet_NaN::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { + return Kokkos::Experimental::epsilon::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { + return Kokkos::Experimental::norm_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION int base() { + return Kokkos::Experimental::radix::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { + return eps() * static_cast(base()); + } + static KOKKOS_FORCEINLINE_FUNCTION int t() { + return Kokkos::Experimental::digits::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return one(); } + static KOKKOS_FORCEINLINE_FUNCTION int emin() { + return Kokkos::Experimental::min_exponent::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { + return Kokkos::Experimental::norm_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION int emax() { + return Kokkos::Experimental::max_exponent::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { + return Kokkos::Experimental::finite_max< + val_type>::value; + } + + // Math Functions + static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { + return Kokkos::isinf(x); + } + static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { + return Kokkos::isnan(x); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { + return Kokkos::abs(x); + } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } @@ -1106,20 +1151,8 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { return Kokkos::atan(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { - return Kokkos::Experimental::epsilon::value; - } - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - // C++ doesn't have a standard "half-float" type. - using halfPrecision = float; - using doublePrecision = double; - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = false; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = true; + // Aliases static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } @@ -1129,42 +1162,10 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { return conj(x); } - static std::string name() { return "float"; } static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { return sqrt(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - return Kokkos::Experimental::quiet_NaN::value; - } static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return Kokkos::Experimental::norm_min::value; - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { - return Kokkos::Experimental::radix::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { - return eps() * static_cast(base()); - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { - return Kokkos::Experimental::digits::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { - return Kokkos::reduction_identity::prod(); - } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { - return Kokkos::Experimental::min_exponent::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return Kokkos::Experimental::norm_min::value; - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { - return Kokkos::Experimental::max_exponent::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return Kokkos::Experimental::finite_max< - val_type>::value; - } }; /// \brief Partial specialization for std::complex. @@ -1418,21 +1419,27 @@ class ArithTraits { static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr bool is_complex = false; + static constexpr bool has_infinity = true; - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION double infinity() { - return Kokkos::Experimental::infinity::value; - } + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + using halfPrecision = float; +#if defined(__CUDA_ARCH__) + using doublePrecision = + double; // CUDA doesn't support long double, unfortunately +#elif defined(__HIP_DEVICE_COMPILE__) + using doublePrecision = + double; // HIP does not support long double unfortunately +#else + using doublePrecision = long double; +#endif // __CUDA_ARCH__ + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; + + static std::string name() { return "double"; } - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { - return Kokkos::isinf(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { - return Kokkos::isnan(x); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return Kokkos::abs(x); - } static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return static_cast(0.0); } @@ -1445,6 +1452,52 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return Kokkos::Experimental::finite_max::value; } + static KOKKOS_FORCEINLINE_FUNCTION double infinity() { + return Kokkos::Experimental::infinity::value; + } + static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { + return Kokkos::Experimental::quiet_NaN::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { + return Kokkos::Experimental::epsilon::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { + return Kokkos::Experimental::norm_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION int base() { + return Kokkos::Experimental::radix::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { + return eps() * static_cast(base()); + } + static KOKKOS_FORCEINLINE_FUNCTION int t() { + return Kokkos::Experimental::digits::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return one(); } + static KOKKOS_FORCEINLINE_FUNCTION int emin() { + return Kokkos::Experimental::min_exponent::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { + return Kokkos::Experimental::norm_min::value; + } + static KOKKOS_FORCEINLINE_FUNCTION int emax() { + return Kokkos::Experimental::max_exponent::value; + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { + return Kokkos::Experimental::finite_max< + val_type>::value; + } + + // Math Functions + static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { + return Kokkos::isinf(x); + } + static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { + return Kokkos::isnan(x); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { + return Kokkos::abs(x); + } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x; } @@ -1500,69 +1553,21 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { return Kokkos::atan(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - return Kokkos::Experimental::quiet_NaN::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { - return Kokkos::Experimental::epsilon::value; - } - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = float; -#if defined(__CUDA_ARCH__) - using doublePrecision = - double; // CUDA doesn't support long double, unfortunately -#elif defined(__HIP_DEVICE_COMPILE__) - using doublePrecision = - double; // HIP does not support long double unfortunately -#else - using doublePrecision = long double; -#endif // __CUDA_ARCH__ - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = false; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = true; - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } + // Aliases + static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type& x) { + return isNan(x) || isInf(x); + } static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { return abs(x); } static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { return conj(x); } - static std::string name() { return "double"; } static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { return sqrt(x); } static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return Kokkos::Experimental::norm_min::value; // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { - return Kokkos::Experimental::radix::value; // same for float as - // for double - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { - return eps() * static_cast(base()); - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { - return Kokkos::Experimental::digits::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return one(); } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { - return Kokkos::Experimental::min_exponent::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return Kokkos::Experimental::norm_min::value; // ??? // should be - // base^(emin-1) - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { - return Kokkos::Experimental::max_exponent::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return Kokkos::Experimental::finite_max< - val_type>::value; // ??? // should be (base^emax)*(1-eps) - } }; // CUDA and HIP do not support long double in device functions, @@ -1579,13 +1584,22 @@ class ArithTraits { static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr bool is_complex = false; + static constexpr bool has_infinity = true; - static constexpr bool has_infinity = true; - static long double infinity() { return HUGE_VALL; } + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + using halfPrecision = double; + // It might be appropriate to use QD's qd_real here. + // For now, long double is the most you get. + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; + + static std::string name() { return "long double"; } - static bool isInf(const val_type& x) { return Kokkos::isinf(x); } - static bool isNan(const val_type& x) { return Kokkos::isnan(x); } - static mag_type abs(const val_type& x) { return Kokkos::abs(x); } static val_type zero() { return static_cast(0.0); } static val_type one() { return static_cast(1.0); } static val_type min() { @@ -1594,52 +1608,17 @@ class ArithTraits { static val_type max() { return Kokkos::Experimental::finite_max::value; } - static mag_type real(const val_type& x) { return x; } - static mag_type imag(const val_type&) { return zero(); } - static val_type conj(const val_type& x) { return x; } - static val_type pow(const val_type& x, const val_type& y) { - return Kokkos::pow(x, y); + static long double infinity() { + return Kokkos::Experimental::infinity::value; } - static val_type sqrt(const val_type& x) { return Kokkos::sqrt(x); } - static val_type cbrt(const val_type& x) { return Kokkos::cbrtl(x); } - static val_type exp(const val_type& x) { return Kokkos::exp(x); } - static val_type log(const val_type& x) { return Kokkos::log(x); } - static val_type log10(const val_type& x) { return Kokkos::log10(x); } - static val_type sin(const val_type& x) { return Kokkos::sin(x); } - static val_type cos(const val_type& x) { return Kokkos::cos(x); } - static val_type tan(const val_type& x) { return Kokkos::tan(x); } - static val_type sinh(const val_type& x) { return Kokkos::sinh(x); } - static val_type cosh(const val_type& x) { return Kokkos::cosh(x); } - static val_type tanh(const val_type& x) { return Kokkos::tanh(x); } - static val_type asin(const val_type& x) { return Kokkos::asin(x); } - static val_type acos(const val_type& x) { return Kokkos::acos(x); } - static val_type atan(const val_type& x) { return Kokkos::atan(x); } static val_type nan() { return Kokkos::Experimental::quiet_NaN::value; } static mag_type epsilon() { return Kokkos::Experimental::epsilon::value; } - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = double; - // It might be appropriate to use QD's qd_real here. - // For now, long double is the most you get. - using doublePrecision = val_type; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = false; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = true; - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static mag_type magnitude(const val_type& x) { return abs(x); } - static val_type conjugate(const val_type& x) { return conj(x); } - static std::string name() { return "long double"; } - static val_type squareroot(const val_type& x) { return sqrt(x); } - static mag_type eps() { return epsilon(); } static mag_type sfmin() { - return Kokkos::Experimental::norm_min::value; // ??? + return Kokkos::Experimental::norm_min::value; } static int base() { return Kokkos::Experimental::radix::value; } static mag_type prec() { return eps() * static_cast(base()); } @@ -1657,10 +1636,40 @@ class ArithTraits { static mag_type rmax() { return Kokkos::Experimental::finite_max::value; } + + // Math Functions + static bool isInf(const val_type& x) { return Kokkos::isinf(x); } + static bool isNan(const val_type& x) { return Kokkos::isnan(x); } + static mag_type abs(const val_type& x) { return Kokkos::abs(x); } + static mag_type real(const val_type& x) { return x; } + static mag_type imag(const val_type&) { return zero(); } + static val_type conj(const val_type& x) { return x; } + static val_type pow(const val_type& x, const val_type& y) { + return Kokkos::pow(x, y); + } + static val_type sqrt(const val_type& x) { return Kokkos::sqrt(x); } + static val_type cbrt(const val_type& x) { return Kokkos::cbrtl(x); } + static val_type exp(const val_type& x) { return Kokkos::exp(x); } + static val_type log(const val_type& x) { return Kokkos::log(x); } + static val_type log10(const val_type& x) { return Kokkos::log10(x); } + static val_type sin(const val_type& x) { return Kokkos::sin(x); } + static val_type cos(const val_type& x) { return Kokkos::cos(x); } + static val_type tan(const val_type& x) { return Kokkos::tan(x); } + static val_type sinh(const val_type& x) { return Kokkos::sinh(x); } + static val_type cosh(const val_type& x) { return Kokkos::cosh(x); } + static val_type tanh(const val_type& x) { return Kokkos::tanh(x); } + static val_type asin(const val_type& x) { return Kokkos::asin(x); } + static val_type acos(const val_type& x) { return Kokkos::acos(x); } + static val_type atan(const val_type& x) { return Kokkos::atan(x); } + static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } + static mag_type magnitude(const val_type& x) { return abs(x); } + static val_type conjugate(const val_type& x) { return conj(x); } + static val_type squareroot(const val_type& x) { return sqrt(x); } + static mag_type eps() { return epsilon(); } }; // long double specialization -#ifdef HAVE_KOKKOSKERNELS_QUADMATH +#ifdef HAVE_KOKKOSKERNELS_QUADMATH // CUDA does not support __float128 in device functions, so none of // the class methods in this specialization are marked as device // functions. @@ -1675,17 +1684,46 @@ class ArithTraits<__float128> { static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr bool is_complex = false; - static constexpr bool has_infinity = true; - static __float128 infinity() { return 1.0q / 0.0q; } - static bool isInf(const __float128 x) { return isinfq(x); } - static bool isNan(const __float128 x) { return isnanq(x); } - static mag_type abs(const __float128 x) { return fabsq(x); } + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + using halfPrecision = double; + // Unfortunately, we can't rely on a standard __float256 type. + using doublePrecision = __float128; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; + static __float128 zero() { return 0.0; } static __float128 one() { return 1.0; } static __float128 min() { return FLT128_MIN; } static __float128 max() { return FLT128_MAX; } + static __float128 infinity() { return 1.0q / 0.0q; } + static __float128 nan() { return strtoflt128("NAN()", NULL); } + static mag_type epsilon() { return FLT128_EPSILON; } + static mag_type sfmin() { + return FLT128_MIN; // ??? + } + static int base() { return 2; } + static mag_type prec() { return eps() * static_cast(base()); } + static int t() { return FLT_MANT_DIG; } + static mag_type rnd() { return 1.0; } + static int emin() { return FLT128_MIN_EXP; } + static mag_type rmin() { + return FLT128_MIN; // ??? // should be base^(emin-1) + } + static int emax() { return FLT128_MAX_EXP; } + static mag_type rmax() { + return FLT128_MAX; // ??? // should be (base^emax)*(1-eps) + } + + // Math Functions + static bool isInf(const __float128 x) { return isinfq(x); } + static bool isNan(const __float128 x) { return isnanq(x); } + static mag_type abs(const __float128 x) { return fabsq(x); } static mag_type real(const __float128 x) { return x; } static mag_type imag(const __float128 /* x */) { return 0.0; } static __float128 conj(const __float128 x) { return x; } @@ -1706,42 +1744,14 @@ class ArithTraits<__float128> { static __float128 asin(const __float128 x) { return asinq(x); } static __float128 acos(const __float128 x) { return acosq(x); } static __float128 atan(const __float128 x) { return atanq(x); } - static mag_type epsilon() { return FLT128_EPSILON; } - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = double; - // Unfortunately, we can't rely on a standard __float256 type. - using doublePrecision = __float128; - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = false; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = true; + //Aliases static bool isnaninf(const __float128 x) { return isNan(x) || isInf(x); } static magnitudeType magnitude(const __float128 x) { return abs(x); } static __float128 conjugate(const __float128 x) { return conj(x); } static std::string name() { return "__float128"; } static __float128 squareroot(const __float128 x) { return sqrt(x); } - static __float128 nan() { - return strtoflt128("NAN()", NULL); // ??? - } static mag_type eps() { return epsilon(); } - static mag_type sfmin() { - return FLT128_MIN; // ??? - } - static int base() { return 2; } - static mag_type prec() { return eps() * static_cast(base()); } - static int t() { return FLT_MANT_DIG; } - static mag_type rnd() { return 1.0; } - static int emin() { return FLT128_MIN_EXP; } - static mag_type rmin() { - return FLT128_MIN; // ??? // should be base^(emin-1) - } - static int emax() { return FLT128_MAX_EXP; } - static mag_type rmax() { - return FLT128_MAX; // ??? // should be (base^emax)*(1-eps) - } }; #endif // HAVE_KOKKOSKERNELS_QUADMATH @@ -1756,24 +1766,22 @@ class ArithTraits< ::Kokkos::complex > { static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr bool is_complex = true; - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return val_type(ArithTraits::infinity(), - ArithTraits::infinity()); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { - return ArithTraits::isInf(x.real()) || - ArithTraits::isInf(x.imag()); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { - return ArithTraits::isNan(x.real()) || - ArithTraits::isNan(x.imag()); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return Kokkos::abs(x); - } + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + using halfPrecision = ::Kokkos::complex::halfPrecision>; + using doublePrecision = + ::Kokkos::complex::doublePrecision>; + + static constexpr bool isComplex = true; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = false; + static constexpr bool hasMachineParameters = + ArithTraits::hasMachineParameters; + + static std::string name() { return "Kokkos::complex"; } + static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return val_type(ArithTraits::zero(), ArithTraits::zero()); @@ -1790,6 +1798,56 @@ class ArithTraits< ::Kokkos::complex > { return val_type(ArithTraits::max(), ArithTraits::max()); // ??? } + static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + return val_type(ArithTraits::infinity(), + ArithTraits::infinity()); + } + static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { + return val_type(ArithTraits::nan(), ArithTraits::nan()); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { + return ArithTraits::epsilon(); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { + return ArithTraits::sfmin(); + } + static KOKKOS_FORCEINLINE_FUNCTION int base() { + return ArithTraits::base(); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { + return ArithTraits::prec(); + } + static KOKKOS_FORCEINLINE_FUNCTION int t() { + return ArithTraits::t(); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { + return ArithTraits::rnd(); + } + static KOKKOS_FORCEINLINE_FUNCTION int emin() { + return ArithTraits::emin(); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { + return ArithTraits::rmin(); + } + static KOKKOS_FORCEINLINE_FUNCTION int emax() { + return ArithTraits::emax(); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { + return ArithTraits::rmax(); + } + + // Math Functions + static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { + return ArithTraits::isInf(x.real()) || + ArithTraits::isInf(x.imag()); + } + static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { + return ArithTraits::isNan(x.real()) || + ArithTraits::isNan(x.imag()); + } + static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { + return Kokkos::abs(x); + } static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { return x.real(); } @@ -1902,25 +1960,8 @@ class ArithTraits< ::Kokkos::complex > { // } // return r_val; // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // ??? - return val_type(ArithTraits::nan(), ArithTraits::nan()); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { - return ArithTraits::epsilon(); // ??? - } - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = ::Kokkos::complex::halfPrecision>; - using doublePrecision = - ::Kokkos::complex::doublePrecision>; - static constexpr bool isComplex = true; - static constexpr bool isOrdinal = false; - static constexpr bool isComparable = false; - static constexpr bool hasMachineParameters = - ArithTraits::hasMachineParameters; + // Aliases static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { return abs(x); @@ -1928,38 +1969,10 @@ class ArithTraits< ::Kokkos::complex > { static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { return conj(x); } - static std::string name() { return "Kokkos::complex"; } - // static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) { - // return sqrt (x); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return ArithTraits::sfmin(); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { - return ArithTraits::base(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { - return ArithTraits::prec(); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { - return ArithTraits::t(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { - return ArithTraits::rnd(); - } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { - return ArithTraits::emin(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return ArithTraits::rmin(); - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { - return ArithTraits::emax(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return ArithTraits::rmax(); + static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) { + return sqrt (x); } + static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } }; template <> From bd26e61bfd9bc7c4bded2778cede501e39609498 Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Fri, 20 May 2022 09:39:47 -0600 Subject: [PATCH 147/261] Remove diagnostic message to stdout Remove the diagnostic message printed to stdout in Controls::getParameter when the requested parameter is not set. --- src/common/KokkosKernels_Controls.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/common/KokkosKernels_Controls.hpp b/src/common/KokkosKernels_Controls.hpp index a1a4fb59ea..2e1a96a7a6 100644 --- a/src/common/KokkosKernels_Controls.hpp +++ b/src/common/KokkosKernels_Controls.hpp @@ -92,8 +92,6 @@ class Controls { const std::string& orUnset = "") const { auto search = kernel_parameters.find(name); if (kernel_parameters.end() == search) { - std::cout << "Parameter " << name - << " was not found in the list of parameters!" << std::endl; return orUnset; } else { return search->second; From a4ff12c9f432e8617675fdf3d6c217b6586edf19 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Sat, 21 May 2022 09:28:44 -0400 Subject: [PATCH 148/261] Update signing key in SYCL image --- scripts/docker/Dockerfile.sycl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index 888a36d510..f5197ab7b3 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -1,6 +1,8 @@ ARG BASE=nvidia/cuda:10.2-devel FROM $BASE +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub + RUN apt-get update && apt-get install -y \ bc \ wget \ From 0418af2b23e598e744ab20b630df1f0f17c11718 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Sat, 21 May 2022 09:31:22 -0400 Subject: [PATCH 149/261] Use KOKKOS_IMPL_DO_NOT_USE_PRINTF in Test_Sparse_spmv.hpp --- unit_test/sparse/Test_Sparse_spmv.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index 6cc48c863b..5cb729f311 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -73,8 +73,9 @@ struct fSPMV { if (error > eps * max_val) { err++; - printf("expected_y(%d)=%f, y(%d)=%f err=%f, max_error=%f\n", i, - AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps * max_val); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "expected_y(%d)=%f, y(%d)=%f err=%f, max_error=%f\n", i, + AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps * max_val); } } }; From 4634678901ac9bc5e53413fcd5e6a03fb21850ef Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 23 May 2022 09:12:12 -0600 Subject: [PATCH 150/261] ArithTraits: cleaning up floating point and complex traits --- src/common/Kokkos_ArithTraits.hpp | 1176 +++++++++++++---------------- 1 file changed, 542 insertions(+), 634 deletions(-) diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp index 22c62a7fe8..cd681488dd 100644 --- a/src/common/Kokkos_ArithTraits.hpp +++ b/src/common/Kokkos_ArithTraits.hpp @@ -1038,374 +1038,133 @@ class ArithTraits { static std::string name() { return "float"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + static val_type zero() { return static_cast(0.0); } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + static val_type one() { return static_cast(1.0); } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + static val_type min() { return Kokkos::Experimental::finite_min::value; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + static val_type max() { return Kokkos::Experimental::finite_max::value; } - static KOKKOS_FORCEINLINE_FUNCTION float infinity() { + static float infinity() { return Kokkos::Experimental::infinity::value; } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { + static val_type nan() { return Kokkos::Experimental::quiet_NaN::value; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { + static mag_type epsilon() { return Kokkos::Experimental::epsilon::value; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { + static mag_type sfmin() { return Kokkos::Experimental::norm_min::value; } - static KOKKOS_FORCEINLINE_FUNCTION int base() { + static int base() { return Kokkos::Experimental::radix::value; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { + static mag_type prec() { return eps() * static_cast(base()); } - static KOKKOS_FORCEINLINE_FUNCTION int t() { + static int t() { return Kokkos::Experimental::digits::value; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return one(); } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { + static mag_type rnd() { return one(); } + static int emin() { return Kokkos::Experimental::min_exponent::value; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { + static mag_type rmin() { return Kokkos::Experimental::norm_min::value; } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { + static int emax() { return Kokkos::Experimental::max_exponent::value; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { + static mag_type rmax() { return Kokkos::Experimental::finite_max< val_type>::value; } // Math Functions - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { + static bool isInf(const val_type x) { return Kokkos::isinf(x); } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { + static bool isNan(const val_type x) { return Kokkos::isnan(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { + static mag_type abs(const val_type x) { return Kokkos::abs(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { + static mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + static mag_type imag(const val_type) { return zero(); } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { + static val_type conj(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { + static val_type pow(const val_type x, const val_type y) { return Kokkos::pow(x, y); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { + static val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { + static val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { + static val_type exp(const val_type x) { return Kokkos::exp(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { + static val_type log(const val_type x) { return Kokkos::log(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { + static val_type log10(const val_type x) { return Kokkos::log10(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { + static val_type sin(const val_type x) { return Kokkos::sin(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { + static val_type cos(const val_type x) { return Kokkos::cos(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { + static val_type tan(const val_type x) { return Kokkos::tan(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { + static val_type sinh(const val_type x) { return Kokkos::sinh(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { + static val_type cosh(const val_type x) { return Kokkos::cosh(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { + static val_type tanh(const val_type x) { return Kokkos::tanh(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { + static val_type asin(const val_type x) { return Kokkos::asin(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { + static val_type acos(const val_type x) { return Kokkos::acos(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { + static val_type atan(const val_type x) { return Kokkos::atan(x); } // Aliases - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type x) { + static bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { + static magnitudeType magnitude(const val_type x) { return abs(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { + static val_type conjugate(const val_type x) { return conj(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { + static val_type squareroot(const val_type x) { return sqrt(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } -}; - -/// \brief Partial specialization for std::complex. -/// -/// The C++ Standard Library (with C++03 at least) only allows -/// std::complex for RealFloatType = float, double, or -/// long double. -template -class ArithTraits > { - public: - //! Kokkos internally replaces std::complex with Kokkos::complex. - using val_type = ::Kokkos::complex; - using mag_type = RealFloatType; - - static constexpr bool is_specialized = true; - static constexpr bool is_signed = true; - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr bool is_complex = true; - - static constexpr bool has_infinity = true; - static std::complex infinity() { - return std::complex(ArithTraits::infinity(), - ArithTraits::infinity()); - } - -#ifdef KOKKOS_ENABLE_SYCL - template - static bool isInf(const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isinf; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isinf; -#endif - return isinf(real(x)) || isinf(imag(x)); - } - template <> - static bool isInf(const std::complex& x) { - Kokkos::abort("isInf not available for std::complex!\n"); - return true; - } -#else - static bool isInf(const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isinf; -#endif - return isinf(real(x)) || isinf(imag(x)); - } -#endif -#ifdef KOKKOS_ENABLE_SYCL - template - static bool isNan(const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isnan; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isnan; -#endif - return isnan(real(x)) || isnan(imag(x)); - } - template <> - static bool isNan(const std::complex& x) { - Kokkos::abort("isNan not available for std::complex!\n"); - return true; - } -#else - static bool isNan(const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isnan; -#endif - return isnan(real(x)) || isnan(imag(x)); - } -#endif - static mag_type abs(const std::complex& x) { - return std::abs(x); - } - static std::complex zero() { - return std::complex(ArithTraits::zero(), - ArithTraits::zero()); - } - static std::complex one() { - return std::complex(ArithTraits::one(), - ArithTraits::zero()); - } - static std::complex min() { - return std::complex(ArithTraits::min(), - ArithTraits::zero()); - } - static std::complex max() { - return std::complex(ArithTraits::max(), - ArithTraits::zero()); - } - static mag_type real(const std::complex& x) { - return std::real(x); - } - static mag_type imag(const std::complex& x) { - return std::imag(x); - } - static std::complex conj( - const std::complex& x) { - return std::conj(x); - } - static std::complex pow(const std::complex& x, - const std::complex& y) { - // Fix for some weird gcc 4.2.1 inaccuracy. - if (y == one()) { - return x; - } else if (y == one() + one()) { - return x * x; - } else { - return std::pow(x, y); - } - } - static std::complex pow(const std::complex& x, - const RealFloatType& y) { - // Fix for some weird gcc 4.2.1 inaccuracy. - if (y == ArithTraits::one()) { - return x; - } else if (y == ArithTraits::one() + - ArithTraits::one()) { - return x * x; - } else { - return std::pow(x, y); - } - } - static std::complex sqrt( - const std::complex& x) { - return std::sqrt(x); - } - static std::complex cbrt( - const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::cbrt(x); -#else - return ::cbrt(x); -#endif - } - static std::complex exp(const std::complex& x) { - return std::exp(x); - } - static std::complex log(const std::complex& x) { - return std::log(x); - } - static std::complex log10( - const std::complex& x) { - return std::log10(x); - } - static std::complex sin(const std::complex& x) { - return std::sin(x); - } - static std::complex cos(const std::complex& x) { - return std::cos(x); - } - static std::complex tan(const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::tan(x); -#else - return std::tan(x); -#endif - } - static std::complex sinh( - const std::complex& x) { - return std::sinh(x); - } - static std::complex cosh( - const std::complex& x) { - return std::cosh(x); - } - static std::complex tanh( - const std::complex& x) { - return std::tanh(x); - } - static std::complex asin( - const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::asin(x); -#else - return ::asin(x); -#endif - } - static std::complex acos( - const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::acos(x); -#else - return ::acos(x); -#endif - } - static std::complex atan( - const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using sycl::atan; -#else - using std::atan; -#endif - return atan(x); - } - static std::complex nan() { - const mag_type mag_nan = ArithTraits::nan(); - return std::complex(mag_nan, mag_nan); - } - static mag_type epsilon() { return ArithTraits::epsilon(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = - std::complex::halfPrecision>; - using doublePrecision = - std::complex::doublePrecision>; - - static constexpr bool isComplex = true; - static constexpr bool isOrdinal = false; - static constexpr bool isComparable = false; - static constexpr bool hasMachineParameters = true; - static bool isnaninf(const std::complex& x) { - return isNan(x) || isInf(x); - } - static mag_type magnitude(const std::complex& x) { - return abs(x); - } - static std::complex conjugate( - const std::complex& x) { - return conj(x); - } - static std::string name() { - return std::string("std::complex<") + ArithTraits::name() + ">"; - } - static std::complex squareroot( - const std::complex& x) { - return sqrt(x); - } - static mag_type eps() { return epsilon(); } - static mag_type sfmin() { return ArithTraits::sfmin(); } - static int base() { return ArithTraits::base(); } - static mag_type prec() { return ArithTraits::prec(); } - static int t() { return ArithTraits::t(); } - static mag_type rnd() { return ArithTraits::one(); } - static int emin() { return ArithTraits::emin(); } - static mag_type rmin() { return ArithTraits::rmin(); } - static int emax() { return ArithTraits::emax(); } - static mag_type rmax() { return ArithTraits::rmax(); } + static mag_type eps() { return epsilon(); } }; template <> @@ -1440,134 +1199,133 @@ class ArithTraits { static std::string name() { return "double"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + static val_type zero() { return static_cast(0.0); } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + static val_type one() { return static_cast(1.0); } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + static val_type min() { return Kokkos::Experimental::finite_min::value; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + static val_type max() { return Kokkos::Experimental::finite_max::value; } - static KOKKOS_FORCEINLINE_FUNCTION double infinity() { + static double infinity() { return Kokkos::Experimental::infinity::value; } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { + static val_type nan() { return Kokkos::Experimental::quiet_NaN::value; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { + static mag_type epsilon() { return Kokkos::Experimental::epsilon::value; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { + static mag_type sfmin() { return Kokkos::Experimental::norm_min::value; } - static KOKKOS_FORCEINLINE_FUNCTION int base() { + static int base() { return Kokkos::Experimental::radix::value; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { + static mag_type prec() { return eps() * static_cast(base()); } - static KOKKOS_FORCEINLINE_FUNCTION int t() { + static int t() { return Kokkos::Experimental::digits::value; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return one(); } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { + static mag_type rnd() { return one(); } + static int emin() { return Kokkos::Experimental::min_exponent::value; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { + static mag_type rmin() { return Kokkos::Experimental::norm_min::value; } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { + static int emax() { return Kokkos::Experimental::max_exponent::value; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { + static mag_type rmax() { return Kokkos::Experimental::finite_max< val_type>::value; } // Math Functions - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { + static bool isInf(const val_type x) { return Kokkos::isinf(x); } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { + static bool isNan(const val_type x) { return Kokkos::isnan(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { + static mag_type abs(const val_type x) { return Kokkos::abs(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { + static mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + static mag_type imag(const val_type) { return zero(); } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { + static val_type conj(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { + static val_type pow(const val_type x, const val_type y) { return Kokkos::pow(x, y); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { + static val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { + static val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { + static val_type exp(const val_type x) { return Kokkos::exp(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { + static val_type log(const val_type x) { return Kokkos::log(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { + static val_type log10(const val_type x) { return Kokkos::log10(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { + static val_type sin(const val_type x) { return Kokkos::sin(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { + static val_type cos(const val_type x) { return Kokkos::cos(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { + static val_type tan(const val_type x) { return Kokkos::tan(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { + static val_type sinh(const val_type x) { return Kokkos::sinh(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { + static val_type cosh(const val_type x) { return Kokkos::cosh(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { + static val_type tanh(const val_type x) { return Kokkos::tanh(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { + static val_type asin(const val_type x) { return Kokkos::asin(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { + static val_type acos(const val_type x) { return Kokkos::acos(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { + static val_type atan(const val_type x) { return Kokkos::atan(x); } // Aliases - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type& x) { + static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { + static mag_type magnitude(const val_type x) { return abs(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { + static val_type conjugate(const val_type x) { return conj(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { + static val_type squareroot(const val_type x) { return sqrt(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } + static mag_type eps() { return epsilon(); } }; // CUDA and HIP do not support long double in device functions, @@ -1688,72 +1446,312 @@ class ArithTraits<__float128> { // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; - using halfPrecision = double; - // Unfortunately, we can't rely on a standard __float256 type. - using doublePrecision = __float128; + using halfPrecision = double; + // Unfortunately, we can't rely on a standard __float256 type. + using doublePrecision = __float128; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; + + static __float128 zero() { return 0.0; } + static __float128 one() { return 1.0; } + static __float128 min() { return FLT128_MIN; } + static __float128 max() { return FLT128_MAX; } + static __float128 infinity() { return 1.0q / 0.0q; } + static __float128 nan() { return strtoflt128("NAN()", NULL); } + static mag_type epsilon() { return FLT128_EPSILON; } + static mag_type sfmin() { + return FLT128_MIN; // ??? + } + static int base() { return 2; } + static mag_type prec() { return eps() * static_cast(base()); } + static int t() { return FLT_MANT_DIG; } + static mag_type rnd() { return 1.0; } + static int emin() { return FLT128_MIN_EXP; } + static mag_type rmin() { + return FLT128_MIN; // ??? // should be base^(emin-1) + } + static int emax() { return FLT128_MAX_EXP; } + static mag_type rmax() { + return FLT128_MAX; // ??? // should be (base^emax)*(1-eps) + } + + // Math Functions + static bool isInf(const __float128 x) { return isinfq(x); } + static bool isNan(const __float128 x) { return isnanq(x); } + static mag_type abs(const __float128 x) { return fabsq(x); } + static mag_type real(const __float128 x) { return x; } + static mag_type imag(const __float128 /* x */) { return 0.0; } + static __float128 conj(const __float128 x) { return x; } + static __float128 pow(const __float128 x, const __float128 y) { + return powq(x, y); + } + static __float128 sqrt(const __float128 x) { return sqrtq(x); } + static __float128 cbrt(const __float128 x) { return cbrtq(x); } + static __float128 exp(const __float128 x) { return exp(x); } + static __float128 log(const __float128 x) { return logq(x); } + static __float128 log10(const __float128 x) { return log10q(x); } + static __float128 sin(const __float128 x) { return sinq(x); } + static __float128 cos(const __float128 x) { return cosq(x); } + static __float128 tan(const __float128 x) { return tanq(x); } + static __float128 sinh(const __float128 x) { return sinhq(x); } + static __float128 cosh(const __float128 x) { return coshq(x); } + static __float128 tanh(const __float128 x) { return tanhq(x); } + static __float128 asin(const __float128 x) { return asinq(x); } + static __float128 acos(const __float128 x) { return acosq(x); } + static __float128 atan(const __float128 x) { return atanq(x); } + + //Aliases + static bool isnaninf(const __float128 x) { return isNan(x) || isInf(x); } + static magnitudeType magnitude(const __float128 x) { return abs(x); } + static __float128 conjugate(const __float128 x) { return conj(x); } + static std::string name() { return "__float128"; } + static __float128 squareroot(const __float128 x) { return sqrt(x); } + static mag_type eps() { return epsilon(); } +}; // __float128 specialization +#endif // HAVE_KOKKOSKERNELS_QUADMATH + +/// \brief Partial specialization for std::complex. +/// +/// The C++ Standard Library (with C++03 at least) only allows +/// std::complex for RealFloatType = float, double, or +/// long double. +template +class ArithTraits > { + public: + //! Kokkos internally replaces std::complex with Kokkos::complex. + using val_type = ::Kokkos::complex; + using mag_type = RealFloatType; + + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = true; + + static constexpr bool has_infinity = true; + static std::complex infinity() { + return std::complex(ArithTraits::infinity(), + ArithTraits::infinity()); + } + +#ifdef KOKKOS_ENABLE_SYCL + template + static bool isInf(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isinf; +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) + using sycl::isinf; +#endif + return isinf(real(x)) || isinf(imag(x)); + } + template <> + static bool isInf(const std::complex& x) { + Kokkos::abort("isInf not available for std::complex!\n"); + return true; + } +#else + static bool isInf(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isinf; +#endif + return isinf(real(x)) || isinf(imag(x)); + } +#endif +#ifdef KOKKOS_ENABLE_SYCL + template + static bool isNan(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isnan; +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) + using sycl::isnan; +#endif + return isnan(real(x)) || isnan(imag(x)); + } + template <> + static bool isNan(const std::complex& x) { + Kokkos::abort("isNan not available for std::complex!\n"); + return true; + } +#else + static bool isNan(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isnan; +#endif + return isnan(real(x)) || isnan(imag(x)); + } +#endif + static mag_type abs(const std::complex& x) { + return std::abs(x); + } + static std::complex zero() { + return std::complex(ArithTraits::zero(), + ArithTraits::zero()); + } + static std::complex one() { + return std::complex(ArithTraits::one(), + ArithTraits::zero()); + } + static std::complex min() { + return std::complex(ArithTraits::min(), + ArithTraits::zero()); + } + static std::complex max() { + return std::complex(ArithTraits::max(), + ArithTraits::zero()); + } + static mag_type real(const std::complex& x) { + return std::real(x); + } + static mag_type imag(const std::complex& x) { + return std::imag(x); + } + static std::complex conj( + const std::complex& x) { + return std::conj(x); + } + static std::complex pow(const std::complex& x, + const std::complex& y) { + // Fix for some weird gcc 4.2.1 inaccuracy. + if (y == one()) { + return x; + } else if (y == one() + one()) { + return x * x; + } else { + return std::pow(x, y); + } + } + static std::complex pow(const std::complex& x, + const RealFloatType& y) { + // Fix for some weird gcc 4.2.1 inaccuracy. + if (y == ArithTraits::one()) { + return x; + } else if (y == ArithTraits::one() + + ArithTraits::one()) { + return x * x; + } else { + return std::pow(x, y); + } + } + static std::complex sqrt( + const std::complex& x) { + return std::sqrt(x); + } + static std::complex cbrt( + const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::cbrt(x); +#else + return ::cbrt(x); +#endif + } + static std::complex exp(const std::complex& x) { + return std::exp(x); + } + static std::complex log(const std::complex& x) { + return std::log(x); + } + static std::complex log10( + const std::complex& x) { + return std::log10(x); + } + static std::complex sin(const std::complex& x) { + return std::sin(x); + } + static std::complex cos(const std::complex& x) { + return std::cos(x); + } + static std::complex tan(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::tan(x); +#else + return std::tan(x); +#endif + } + static std::complex sinh( + const std::complex& x) { + return std::sinh(x); + } + static std::complex cosh( + const std::complex& x) { + return std::cosh(x); + } + static std::complex tanh( + const std::complex& x) { + return std::tanh(x); + } + static std::complex asin( + const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::asin(x); +#else + return ::asin(x); +#endif + } + static std::complex acos( + const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::acos(x); +#else + return ::acos(x); +#endif + } + static std::complex atan( + const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::atan; +#else + using std::atan; +#endif + return atan(x); + } + static std::complex nan() { + const mag_type mag_nan = ArithTraits::nan(); + return std::complex(mag_nan, mag_nan); + } + static mag_type epsilon() { return ArithTraits::epsilon(); } + + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + using halfPrecision = + std::complex::halfPrecision>; + using doublePrecision = + std::complex::doublePrecision>; - static constexpr bool isComplex = false; + static constexpr bool isComplex = true; static constexpr bool isOrdinal = false; - static constexpr bool isComparable = true; + static constexpr bool isComparable = false; static constexpr bool hasMachineParameters = true; - - static __float128 zero() { return 0.0; } - static __float128 one() { return 1.0; } - static __float128 min() { return FLT128_MIN; } - static __float128 max() { return FLT128_MAX; } - static __float128 infinity() { return 1.0q / 0.0q; } - static __float128 nan() { return strtoflt128("NAN()", NULL); } - static mag_type epsilon() { return FLT128_EPSILON; } - static mag_type sfmin() { - return FLT128_MIN; // ??? + static bool isnaninf(const std::complex& x) { + return isNan(x) || isInf(x); } - static int base() { return 2; } - static mag_type prec() { return eps() * static_cast(base()); } - static int t() { return FLT_MANT_DIG; } - static mag_type rnd() { return 1.0; } - static int emin() { return FLT128_MIN_EXP; } - static mag_type rmin() { - return FLT128_MIN; // ??? // should be base^(emin-1) + static mag_type magnitude(const std::complex& x) { + return abs(x); } - static int emax() { return FLT128_MAX_EXP; } - static mag_type rmax() { - return FLT128_MAX; // ??? // should be (base^emax)*(1-eps) + static std::complex conjugate( + const std::complex& x) { + return conj(x); } - - // Math Functions - static bool isInf(const __float128 x) { return isinfq(x); } - static bool isNan(const __float128 x) { return isnanq(x); } - static mag_type abs(const __float128 x) { return fabsq(x); } - static mag_type real(const __float128 x) { return x; } - static mag_type imag(const __float128 /* x */) { return 0.0; } - static __float128 conj(const __float128 x) { return x; } - static __float128 pow(const __float128 x, const __float128 y) { - return powq(x, y); + static std::string name() { + return std::string("std::complex<") + ArithTraits::name() + ">"; + } + static std::complex squareroot( + const std::complex& x) { + return sqrt(x); } - static __float128 sqrt(const __float128 x) { return sqrtq(x); } - static __float128 cbrt(const __float128 x) { return cbrtq(x); } - static __float128 exp(const __float128 x) { return exp(x); } - static __float128 log(const __float128 x) { return logq(x); } - static __float128 log10(const __float128 x) { return log10q(x); } - static __float128 sin(const __float128 x) { return sinq(x); } - static __float128 cos(const __float128 x) { return cosq(x); } - static __float128 tan(const __float128 x) { return tanq(x); } - static __float128 sinh(const __float128 x) { return sinhq(x); } - static __float128 cosh(const __float128 x) { return coshq(x); } - static __float128 tanh(const __float128 x) { return tanhq(x); } - static __float128 asin(const __float128 x) { return asinq(x); } - static __float128 acos(const __float128 x) { return acosq(x); } - static __float128 atan(const __float128 x) { return atanq(x); } - - //Aliases - static bool isnaninf(const __float128 x) { return isNan(x) || isInf(x); } - static magnitudeType magnitude(const __float128 x) { return abs(x); } - static __float128 conjugate(const __float128 x) { return conj(x); } - static std::string name() { return "__float128"; } - static __float128 squareroot(const __float128 x) { return sqrt(x); } static mag_type eps() { return epsilon(); } + static mag_type sfmin() { return ArithTraits::sfmin(); } + static int base() { return ArithTraits::base(); } + static mag_type prec() { return ArithTraits::prec(); } + static int t() { return ArithTraits::t(); } + static mag_type rnd() { return ArithTraits::one(); } + static int emin() { return ArithTraits::emin(); } + static mag_type rmin() { return ArithTraits::rmin(); } + static int emax() { return ArithTraits::emax(); } + static mag_type rmax() { return ArithTraits::rmax(); } }; -#endif // HAVE_KOKKOSKERNELS_QUADMATH template <> class ArithTraits< ::Kokkos::complex > { @@ -1782,197 +1780,152 @@ class ArithTraits< ::Kokkos::complex > { static std::string name() { return "Kokkos::complex"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + static val_type zero() { return val_type(ArithTraits::zero(), ArithTraits::zero()); } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + static val_type one() { return val_type(ArithTraits::one(), ArithTraits::zero()); } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + static val_type min() { return val_type(ArithTraits::min(), - ArithTraits::min()); // ??? + ArithTraits::min()); } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + static val_type max() { return val_type(ArithTraits::max(), - ArithTraits::max()); // ??? + ArithTraits::max()); } - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + static val_type infinity() { return val_type(ArithTraits::infinity(), ArithTraits::infinity()); } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { + static val_type nan() { return val_type(ArithTraits::nan(), ArithTraits::nan()); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { + static mag_type epsilon() { return ArithTraits::epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { + static mag_type sfmin() { return ArithTraits::sfmin(); } - static KOKKOS_FORCEINLINE_FUNCTION int base() { + static int base() { return ArithTraits::base(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { + static mag_type prec() { return ArithTraits::prec(); } - static KOKKOS_FORCEINLINE_FUNCTION int t() { + static int t() { return ArithTraits::t(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { + static mag_type rnd() { return ArithTraits::rnd(); } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { + static int emin() { return ArithTraits::emin(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { + static mag_type rmin() { return ArithTraits::rmin(); } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { + static int emax() { return ArithTraits::emax(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { + static mag_type rmax() { return ArithTraits::rmax(); } // Math Functions - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { + static bool isInf(const val_type x) { return ArithTraits::isInf(x.real()) || ArithTraits::isInf(x.imag()); } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { + static bool isNan(const val_type x) { return ArithTraits::isNan(x.real()) || ArithTraits::isNan(x.imag()); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { + static mag_type abs(const val_type x) { return Kokkos::abs(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { + static mag_type real(const val_type x) { return x.real(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type x) { + static mag_type imag(const val_type x) { return x.imag(); } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { + static val_type conj(const val_type x) { return ::Kokkos::conj(x); } - // Note lbv 05-18-2022: we could just use the function defined in - // Kokkos_Complex.hpp and enable this feature - // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const - // val_type y) { - // const mag_type abs_x_square = x.real()*x.real() + x.imag()*x.imag(); - // const mag_type arg_x = ArithTraits::atan(x.imag()/x.real()); - // const mag_type half = mag_type(0.5); - // const mag_type alpha = (ArithTraits::pow(abs_x_square, - // half*y.real()) * - // ArithTraits::exp(-y.imag()*arg_x)); - // return val_type(alpha* ArithTraits::cos(y.real()*arg_x + - // half*y.imag()*ArithTraits::log(abs_x_square)), - // alpha* ArithTraits::sin(y.real()*arg_x + - // half*y.imag()*ArithTraits::log(abs_x_square))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const - // mag_type y) { - // const mag_type arg_x = ArithTraits::atan(x.imag()/x.real()); - // const mag_type alpha = ArithTraits::pow(abs(x),y); - // return val_type(alpha* ArithTraits::cos(y*arg_x), - // alpha* ArithTraits::sin(y*arg_x)); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { + static val_type pow (const val_type x, const + val_type y) { + return Kokkos::pow(x, y); + } + static val_type pow (const val_type x, const + mag_type y) { + return Kokkos::pow(x, y); + } + static val_type pow (const mag_type x, const + val_type y) { + return Kokkos::pow(x, y); + } + static val_type sqrt(const val_type x) { return ::Kokkos::sqrt(x); } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { + // static val_type cbrt (const val_type x) { // const mag_type r = ::Kokkos::abs(x); // const mag_type phi = ::atan(x.imag()/x.real())/mag_type(3); // const mag_type re = r* ::cos(phi); // const mag_type im = r* ::sin(phi); // return val_type(re,im); // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { - // const mag_type xx = ::exp(x.real()); - // const mag_type re = xx* ::cos(x.imag()); - // const mag_type im = xx* ::sin(x.imag()); - // return val_type(re,im); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { - // return val_type(ArithTraits::log(abs(x)), - // ArithTraits::atan(x.imag()/x.real())); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type log10 (const val_type x) { - // return log(x)/ArithTraits::log(mag_type(10)); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type xx = exp(-ii*x) - exp(ii*x); - // const mag_type half = 0.5; - // return val_type(-half*xx.imag(),half*xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // const val_type xx = exp(x) - exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(),half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // const val_type ii(0, 1); - // const val_type e_nix = exp(-ii*x); - // const val_type e_pix = exp( ii*x); - // return ii*(e_nix - e_pix)/(e_nix + e_pix); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // const val_type xx = exp(x) - exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(), half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // const val_type xx = exp(x) + exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(), half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // const val_type e_2x = exp(2*x); - // return (e_2x - 1)/(e_2x + 1); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // const val_type ii(0, 1); - // const val_type xx = -ii*log(ii*x + sqrt(val_type(1) - x*x)); - // return val_type(xx.imag(),-xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type xx = -ii*log(x + ii*sqrt(val_type(1) - x*x)); - // return val_type(xx.imag(),-xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // val_type r_val; - // const val_type ii = val_type(0, 1); - // if (x == ii) { - // r_val = val_type(ArithTraits::nan(), - // std::numeric_limits::infinity()); - // } if (x == -ii) { - // r_val = val_type(ArithTraits::nan(), - // -std::numeric_limits::infinity()); - // } else { - // const val_type ii_x = ii*x; - // const mag_type half = 0.5; - // const val_type xx = log(val_type(1) - ii_x) - log(val_type(1) + ii_x); - // r_val = val_type(-half*xx.imag(), half*xx.real()); - // } - // return r_val; - // } + static val_type exp (const val_type x) { + return Kokkos::exp(x); + } + static val_type log (const val_type x) { + return Kokkos::log(x); + } + static val_type log10 (const val_type x) { + return Kokkos::log10(x); + } + static val_type sin (const val_type x) { + return Kokkos::sin(x); + } + static val_type cos (const val_type x) { + return Kokkos::cos(x); + } + static val_type tan (const val_type x) { + return Kokkos::tan(x); + } + static val_type sinh (const val_type x) { + return Kokkos::cosh(x); + } + static val_type cosh (const val_type x) { + return Kokkos::cosh(x); + } + static val_type tanh (const val_type x) { + return Kokkos::tanh(x); + } + static val_type asin (const val_type x) { + return Kokkos::asin(x); + } + static val_type acos (const val_type x) { + return Kokkos::acos(x); + } + static val_type atan (const val_type x) { + return Kokkos::atan(x); + } // Aliases static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { + static mag_type magnitude(const val_type x) { return abs(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { + static val_type conjugate(const val_type x) { return conj(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) { + static val_type squareroot (const val_type x) { return sqrt (x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } + static mag_type eps() { return epsilon(); } }; template <> @@ -1988,155 +1941,110 @@ class ArithTraits< ::Kokkos::complex > { static constexpr bool is_complex = true; static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + static val_type infinity() { return val_type(ArithTraits::infinity(), ArithTraits::infinity()); } - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { + static bool isInf(const val_type x) { return ArithTraits::isInf(x.real()) || ArithTraits::isInf(x.imag()); } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { + static bool isNan(const val_type x) { return ArithTraits::isNan(x.real()) || ArithTraits::isNan(x.imag()); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { + static mag_type abs(const val_type x) { return ::Kokkos::abs(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + static val_type zero() { return val_type(ArithTraits::zero(), ArithTraits::zero()); } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + static val_type one() { return val_type(ArithTraits::one(), ArithTraits::zero()); } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + static val_type min() { return val_type(ArithTraits::min(), - ArithTraits::min()); // ??? + ArithTraits::min()); } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + static val_type max() { return val_type(ArithTraits::max(), - ArithTraits::max()); // ??? + ArithTraits::max()); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { + static mag_type real(const val_type x) { return x.real(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type x) { + static mag_type imag(const val_type x) { return x.imag(); } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { + static val_type conj(const val_type x) { return ::Kokkos::conj(x); } - // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const - // val_type y) { - // const mag_type abs_x_square = x.real()*x.real() + x.imag()*x.imag(); - // const mag_type arg_x = ArithTraits::atan(x.imag()/x.real()); - // const mag_type half = mag_type(0.5); - // const mag_type alpha = (ArithTraits::pow(abs_x_square, - // half*y.real()) * - // ArithTraits::exp(-y.imag()*arg_x)); - // return val_type(alpha* ArithTraits::cos(y.real()*arg_x + - // half*y.imag()*ArithTraits::log(abs_x_square)), - // alpha* ArithTraits::sin(y.real()*arg_x + - // half*y.imag()*ArithTraits::log(abs_x_square))); - - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const - // mag_type y) { - // const mag_type arg_x = ArithTraits::atan(x.imag()/x.real()); - // const mag_type alpha = ArithTraits::pow(abs(x),y); - // return val_type(alpha* ArithTraits::cos(y*arg_x), - // alpha* ArithTraits::sin(y*arg_x)); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { + static val_type pow (const val_type x, const + val_type y) { + return Kokkos::pow(x, y); + } + static val_type pow (const val_type x, const + mag_type y) { + return Kokkos::pow(x, y); + } + static val_type pow (const mag_type x, const + val_type y) { + return Kokkos::pow(x, y); + } + static val_type sqrt(const val_type x) { return ::Kokkos::sqrt(x); } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { + // static val_type cbrt (const val_type x) { // const mag_type r = ::Kokkos::abs(x); // const mag_type phi = ::atan(x.imag()/x.real())/mag_type(3); // const mag_type re = r* ::cos(phi); // const mag_type im = r* ::sin(phi); // return val_type(re,im); // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { - // const mag_type xx = ::exp(x.real()); - // const mag_type re = xx* ::cos(x.imag()); - // const mag_type im = xx* ::sin(x.imag()); - // return val_type(re,im); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { - // return val_type(ArithTraits::log(abs(x)), - // ArithTraits::atan(x.imag()/x.real())); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type log10 (const val_type x) { - // return log(x)/ArithTraits::log(mag_type(10)); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type xx = exp(-ii*x) - exp(ii*x); - // const mag_type half = 0.5; - // return val_type(-half*xx.imag(),half*xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // const val_type xx = exp(x) - exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(),half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type e_nix = exp(-ii*x); - // const val_type e_pix = exp( ii*x); - // return ii*(e_nix - e_pix)/(e_nix + e_pix); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // const val_type xx = exp(x) - exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(), half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // const val_type xx = exp(x) + exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(), half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // const val_type e_2x = exp(2*x); - // return (e_2x - 1)/(e_2x + 1); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // const val_type ii(0, 1); - // const val_type xx = -ii*log(ii*x + sqrt(val_type(1) - x*x)); - // return val_type(xx.imag(),-xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type xx = -ii*log(x + ii*sqrt(val_type(1) - x*x)); - // return val_type(xx.imag(),-xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // val_type r_val; - // const val_type ii = val_type(0, 1); - // if (x == ii) { - // r_val = val_type(ArithTraits::nan(), - // std::numeric_limits::infinity()); - // } if (x == -ii) { - // r_val = val_type(ArithTraits::nan(), - // -std::numeric_limits::infinity()); - // } else { - // const val_type ii_x = ii*x; - // const mag_type half = 0.5; - // const val_type xx = log(val_type(1) - ii_x) - log(val_type(1) + ii_x); - // r_val = val_type(-half*xx.imag(), half*xx.real()); - // } - // return r_val; - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // ??? + static val_type exp (const val_type x) { + return Kokkos::exp(x); + } + static val_type log (const val_type x) { + return Kokkos::log(x); + } + static val_type log10 (const val_type x) { + return Kokkos::log10(x); + } + static val_type sin (const val_type x) { + return Kokkos::sin(x); + } + static val_type cos (const val_type x) { + return Kokkos::cos(x); + } + static val_type tan (const val_type x) { + return Kokkos::tan(x); + } + static val_type sinh (const val_type x) { + return Kokkos::sinh(x); + } + static val_type cosh (const val_type x) { + return Kokkos::cosh(x); + } + static val_type tanh (const val_type x) { + return Kokkos::tanh(x); + } + static val_type asin (const val_type x) { + return Kokkos::asin(x); + } + static val_type acos (const val_type x) { + return Kokkos::acos(x); + } + static val_type atan (const val_type x) { + return Kokkos::atan(x); + } + static val_type nan() { return val_type(ArithTraits::nan(), ArithTraits::nan()); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { - return ArithTraits::epsilon(); // ??? + static mag_type epsilon() { + return ArithTraits::epsilon(); } // Backwards compatibility with Teuchos::ScalarTraits. @@ -2151,42 +2059,42 @@ class ArithTraits< ::Kokkos::complex > { static constexpr bool hasMachineParameters = ArithTraits::hasMachineParameters; static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { + static mag_type magnitude(const val_type x) { return abs(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { + static val_type conjugate(const val_type x) { return conj(x); } static std::string name() { return "Kokkos::complex"; } - // static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) { - // return sqrt (x); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return ArithTraits::sfmin(); // ??? + static val_type squareroot (const val_type x) { + return sqrt (x); } - static KOKKOS_FORCEINLINE_FUNCTION int base() { + static mag_type eps() { return epsilon(); } + static mag_type sfmin() { + return ArithTraits::sfmin(); + } + static int base() { return ArithTraits::base(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { - return ArithTraits::prec(); // ??? + static mag_type prec() { + return ArithTraits::prec(); } - static KOKKOS_FORCEINLINE_FUNCTION int t() { + static int t() { return ArithTraits::t(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { + static mag_type rnd() { return ArithTraits::rnd(); } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { + static int emin() { return ArithTraits::emin(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { + static mag_type rmin() { return ArithTraits::rmin(); } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { + static int emax() { return ArithTraits::emax(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { + static mag_type rmax() { return ArithTraits::rmax(); } }; From 30812b1e5d4c4f82cde4aa26a5085e0c1257b1df Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 24 May 2022 16:01:56 -0600 Subject: [PATCH 151/261] example/half: Add xpy.cpp - Add vector addition Kokkos code to compare relative error and runtime across float, half_t, and bhalf_t. - Add script for reproducing workshop paper results. --- example/CMakeLists.txt | 1 + example/half/CMakeLists.txt | 17 ++ .../half/us-rse-escience-2022-reproducer.sh | 195 ++++++++++++++++++ example/half/xpy.cpp | 135 ++++++++++++ 4 files changed, 348 insertions(+) create mode 100644 example/half/CMakeLists.txt create mode 100755 example/half/us-rse-escience-2022-reproducer.sh create mode 100644 example/half/xpy.cpp diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 6ef9a91e55..45fb3a41e1 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -8,3 +8,4 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../test_common) ADD_SUBDIRECTORY(wiki) ADD_SUBDIRECTORY(gmres) ADD_SUBDIRECTORY(batched_solve) +ADD_SUBDIRECTORY(half) diff --git a/example/half/CMakeLists.txt b/example/half/CMakeLists.txt new file mode 100644 index 0000000000..6516fdc8b7 --- /dev/null +++ b/example/half/CMakeLists.txt @@ -0,0 +1,17 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +KOKKOSKERNELS_ADD_EXECUTABLE( + xpy + SOURCES xpy.cpp + ) + +#KOKKOSKERNELS_ADD_EXECUTABLE( +# spmv +# SOURCES spmv.cpp +# ) +# +#KOKKOSKERNELS_ADD_EXECUTABLE( +# dot +# SOURCES dot.cpp +# ) \ No newline at end of file diff --git a/example/half/us-rse-escience-2022-reproducer.sh b/example/half/us-rse-escience-2022-reproducer.sh new file mode 100755 index 0000000000..ef7ffb0eef --- /dev/null +++ b/example/half/us-rse-escience-2022-reproducer.sh @@ -0,0 +1,195 @@ +#!/bin/bash +################################################################################ +# @Brief: On the specified arch, build and run xpy. +# +# Author: Evan Harvey +################################################################################ + +function envprint() { + for x in $@; do + echo $x:\$$x | envsubst + done +} + +function printhelp() { + echo "--Usage--" + echo "$0 HOST_ARCH " + echo " HOST_ARCH: POWER9, A64FX, SKX" + echo " ACCELERATOR_ARCH: VOLTA70" + echo "" + echo "Invocation used for us-rse-escience-2022 results:" + echo "env KOKKOS_SRC_DIR=$HOME/KOKKOS.base/kokkos KOKKOSKERNELS_SRC_DIR=$HOME/KOKKOS.base/kokkos-kernels/ KOKKOSKERNELS_SHA=TODO-HEAD-SHA ./us-rse-escience-2022-reproducer.sh POWER9 VOLTA70" +} + +function earlyexit() { + rm -rf $benchmark_dir + exit $1 +} + +function beval() { + local ret=0 + echo "---------------------------------------------------------------------------------------------------------------" + echo "START: \"$@\"" + if [ $dry_run == "off" ]; then + eval $@ + ret=$PIPESTATUS + fi + if [ $ret -ne 0 ]; then + echo "ERROR: \"$@\"" + earlyexit 1 + fi + echo "END : \"$@\"" + echo "---------------------------------------------------------------------------------------------------------------" +} + +# Handle input args +export KOKKOS_SRC_DIR=${KOKKOS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos"} +export KOKKOS_SRC_DIR=$(realpath $KOKKOS_SRC_DIR) +export KOKKOS_SHA=${KOKKOS_SHA:-"tags/3.6.00"} +export KOKKOSKERNELS_SRC_DIR=${KOKKOSKERNELS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos-kernels"} +export KOKKOSKERNELS_SRC_DIR=$(realpath $KOKKOSKERNELS_SRC_DIR) +envprint KOKKOS_SRC_DIR KOKKOS_SHA KOKKOSKERNELS_SRC_DIR KOKKOSKERNELS_SHA + +dry_run="off" +arch_names="$1 $2" +echo "HOST_ARCH=\"$1\", ACCELERATOR_ARCH=\"$2\"" + +# Create benchmark directory +benchmark_dir=$PWD/$0_$(date +"%Y-%m-%d_%H.%M.%S") +beval mkdir -p $benchmark_dir/kokkos-{build,install} +beval mkdir -p $benchmark_dir/kokkos-kernels-{build,install} +export KOKKOS_BUILD_DIR=$(realpath $benchmark_dir/kokkos-build) +export KOKKOS_INSTALL_DIR=$(realpath $benchmark_dir/kokkos-install) +export KOKKOSKERNELS_BUILD_DIR=$(realpath $benchmark_dir/kokkos-kernels-build) +export KOKKOSKERNELS_INSTALL_DIR=$(realpath $benchmark_dir/kokkos-kernels-install) +envprint KOKKOS_INSTALL_DIR KOKKOS_BUILD_DIR KOKKOSKERNELS_BUILD_DIR KOKKOSKERNELS_INSTALL_DIR + +# Setup arch specific cmake configurations and job submission commands +if [[ "$arch_names" == " " ]]; then + printhelp; earlyexit 1 +elif [ "$arch_names" == "POWER9 VOLTA70" ]; then + module purge + module load cuda/11.2.0 gcc/8.3.1 cmake/3.18.0 + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=Power9,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --arch=Power9,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="bsub -q normal -W 2:00 -Is $KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="bsub -q normal -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="bsub -q normal -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/bench.sh" +elif [ "$arch_names" == "A64FX " ]; then + export OMP_PROC_BIND=close + export OMP_PLACES=cores + export OMP_NUM_THREADS=48 + module purge + module load gcc/10.2.0 cmake/3.17.0 + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=A64FX \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --cxxflags='-msve-vector-bits=512 -Ofast' --arch=A64FX --with-openmp \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="srun --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh" +elif [ "$arch_names" == "SKX " ]; then + export OMP_PROC_BIND=close + export OMP_PLACES=cores + export OMP_NUM_THREADS=96 + module purge + module load gcc/7.2.0 cmake/3.19.3 + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=SKX \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --cxxflags='-O3' --arch=SKX --with-openmp \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="srun --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh" + use_simd="--use_simd=1" +else + echo "Invalid arch: $arch_names" + printhelp; earlyexit 1 +fi + +# Write the arch agnostic kokkos build script +echo "#!/bin/bash" > $KOKKOS_BUILD_DIR/build.sh +echo "cd $KOKKOS_BUILD_DIR" >> $KOKKOS_BUILD_DIR/build.sh +echo "make -j40 install" >> $KOKKOS_BUILD_DIR/build.sh +chmod +x $KOKKOS_BUILD_DIR/build.sh + +# Write the arch agnostic kokkos-kernels build script +echo "#!/bin/bash" > $KOKKOSKERNELS_BUILD_DIR/build.sh +echo "cd $KOKKOSKERNELS_BUILD_DIR/example/half" >> $KOKKOSKERNELS_BUILD_DIR/build.sh +echo "make -j40 xpy" >> $KOKKOSKERNELS_BUILD_DIR/build.sh +chmod +x $KOKKOSKERNELS_BUILD_DIR/build.sh + +# Write the arch agnostic kokkos-kernels benchmark script +echo "#!/bin/bash" > $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "cd $benchmark_dir" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 10 0 &> xpy_relative_error-10.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 100 0 &> xpy_relative_error-100.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 1000 0 &> xpy_relative_error-1000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 10000 0 &> xpy_relative_error-10000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 100000 0 &> xpy_relative_error-100000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh + +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 50000 1 &> xpy_runtime_only-50000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 500000 1 &> xpy_runtime_only-500000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 5000000 1 &> xpy_runtime_only-5000000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 50000000 1 &> xpy_runtime_only-50000000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 500000000 1 &> xpy_runtime_only-500000000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +chmod +x $KOKKOSKERNELS_BUILD_DIR/bench.sh + +# Check out the correct SHAs +beval "cd $KOKKOS_SRC_DIR && git checkout $KOKKOS_SHA" +beval "cd $KOKKOSKERNELS_SRC_DIR && git checkout $KOKKOSKERNELS_SHA" + +# Build Kokkos +beval $kokkos_config_cmd +beval $kokkos_config_defaults_cmd +beval $kokkos_build_cmd + +# Wait for the file system on the head node to catch up +while [[ "$arch_names" == "POWER9 VOLTA70" && ! -e $KOKKOS_INSTALL_DIR/bin/nvcc_wrapper ]]; do + sleep 3s +done + +# Build KokkosKernels +beval $kokkoskernels_config_cmd +beval $kokkoskernels_config_defaults_cmd +beval $kokkoskernels_build_cmd + +# Run the benchmark +beval $benchmark_cmd +beval "cat ${benchmark_dir}/xpy.out" diff --git a/example/half/xpy.cpp b/example/half/xpy.cpp new file mode 100644 index 0000000000..3c909deca4 --- /dev/null +++ b/example/half/xpy.cpp @@ -0,0 +1,135 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" +#include "KokkosKernels_default_types.hpp" + +template +struct Functor_xpy { + ViewType x, y; + + KOKKOS_INLINE_FUNCTION + void operator()(const int &i) const { x(i) += y(i); } +}; + +template +void do_xpy(size_t n, bool time_only = false) { + using namespace Kokkos; + using ExecutionSpace = typename DeviceType::execution_space; + using ViewType = View; + using ReferenceScalarType = double; + + ViewType x("x", n); + ViewType y("y", n); + View x_rand("x_rand", n); + View y_rand("y_rand", n); + + View expected("expected", n); + View relative_error( + "relative_error", n); + typename ViewType::HostMirror x_host = create_mirror_view(x); + typename ViewType::HostMirror y_host = create_mirror_view(y); + // TODO: Report segfault in random_pool creation with: + // typename ViewType::HostMirror y_host = create_mirror_view(y_host); + + Random_XorShift64_Pool random_pool(12345); + fill_random(x_rand, random_pool, ReferenceScalarType(1.0), + ReferenceScalarType(2.0)); + fill_random(y_rand, random_pool, ReferenceScalarType(1.0), + ReferenceScalarType(2.0)); + ExecutionSpace().fence(); + + deep_copy(x, x_rand); + deep_copy(y, y_rand); + ExecutionSpace().fence(); + + deep_copy(x_host, x); + deep_copy(y_host, y); + ExecutionSpace().fence(); + + Functor_xpy xpy; + xpy.x = x; + xpy.y = y; + Timer timer; + parallel_for("xpy", n, xpy); + ExecutionSpace().fence(); + double s = timer.seconds(); + + if (!time_only) { + for (int i = 0; i < n; i++) + expected(i) = static_cast(y_host(i)) + + static_cast(x_host(i)); + } + + deep_copy(x_host, x); + ExecutionSpace().fence(); + + std::cout << "n: " << n << ", Runtime(s): " << s << std::endl; + + if (!time_only) { + std::cout << "-- " << typeid(ScalarType).name() << " Relative Errors --" + << std::endl; + for (int i = 0; i < n; i++) { + std::cout << std::abs(expected(i) - x_host(i)) / expected(i) << ", "; + } + std::cout << std::endl << std::endl; + } +} + +int main(int argc, char **argv) { + Kokkos::initialize(); + if (argc < 2) { + std::cout << "./" << argv[0] << " N:Z TIME_ONLY:{0,1}" << std::endl; + Kokkos::finalize(); + return 1; + } + using LayoutType = Kokkos::LayoutLeft; + using DeviceType = default_device; + size_t n = atoi(argv[1]); + bool time_only = static_cast(atoi(argv[2])); + do_xpy(n, time_only); + do_xpy(n, time_only); + do_xpy(n, time_only); + Kokkos::finalize(); + return 0; +} \ No newline at end of file From fcbfb8f6022521acca3556cb75f56d92c1d8eec8 Mon Sep 17 00:00:00 2001 From: Carl William Pearson Date: Wed, 25 May 2022 09:34:43 -0600 Subject: [PATCH 152/261] add stderr diagnostic message when getParameter on unset --- src/common/KokkosKernels_Controls.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/common/KokkosKernels_Controls.hpp b/src/common/KokkosKernels_Controls.hpp index 2e1a96a7a6..aabe0069be 100644 --- a/src/common/KokkosKernels_Controls.hpp +++ b/src/common/KokkosKernels_Controls.hpp @@ -92,6 +92,8 @@ class Controls { const std::string& orUnset = "") const { auto search = kernel_parameters.find(name); if (kernel_parameters.end() == search) { + std::cerr << "WARNING: Controls::getParameter for name \"" << name + << "\" was unset" << std::endl; return orUnset; } else { return search->second; From 8d5658962d30c24f2c3cc1b98a6aa7fc670b4980 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 25 May 2022 11:03:31 -0600 Subject: [PATCH 153/261] example/half: - xpy.cpp: Update csv output - CMakeLists.txt: Remove comments - us-rse-escience-2022-reproducer.sh: Add "SNB VOLTA70 and "ZEN2 AMPERE80" --- example/half/CMakeLists.txt | 10 ----- .../half/us-rse-escience-2022-reproducer.sh | 44 +++++++++++++++++++ example/half/xpy.cpp | 14 +++--- 3 files changed, 52 insertions(+), 16 deletions(-) diff --git a/example/half/CMakeLists.txt b/example/half/CMakeLists.txt index 6516fdc8b7..49553f573f 100644 --- a/example/half/CMakeLists.txt +++ b/example/half/CMakeLists.txt @@ -5,13 +5,3 @@ KOKKOSKERNELS_ADD_EXECUTABLE( xpy SOURCES xpy.cpp ) - -#KOKKOSKERNELS_ADD_EXECUTABLE( -# spmv -# SOURCES spmv.cpp -# ) -# -#KOKKOSKERNELS_ADD_EXECUTABLE( -# dot -# SOURCES dot.cpp -# ) \ No newline at end of file diff --git a/example/half/us-rse-escience-2022-reproducer.sh b/example/half/us-rse-escience-2022-reproducer.sh index ef7ffb0eef..39c233eb1c 100755 --- a/example/half/us-rse-escience-2022-reproducer.sh +++ b/example/half/us-rse-escience-2022-reproducer.sh @@ -89,6 +89,50 @@ elif [ "$arch_names" == "POWER9 VOLTA70" ]; then kokkos_build_cmd="bsub -q normal -W 2:00 -Is $KOKKOS_BUILD_DIR/build.sh" kokkoskernels_build_cmd="bsub -q normal -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/build.sh" benchmark_cmd="bsub -q normal -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/bench.sh" +elif [ "$arch_names" == "SNB VOLTA70" ]; then + module purge + module load sems-archive-env sems-env sems-gcc/8.3.0 sems-cmake/3.19.1 cuda/11.2 sems-archive-git/2.10.1 + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="$KOKKOSKERNELS_BUILD_DIR/bench.sh" +elif [ "$arch_names" == "ZEN2 AMPERE80" ]; then + module purge + module load gcc/8.1.0 cuda/11.2.0 git/TODO cmake/TODO + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=Zen2,Ampere80 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --arch=Zen2,Ampere80 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="$KOKKOSKERNELS_BUILD_DIR/bench.sh" elif [ "$arch_names" == "A64FX " ]; then export OMP_PROC_BIND=close export OMP_PLACES=cores diff --git a/example/half/xpy.cpp b/example/half/xpy.cpp index 3c909deca4..bc6bf7481d 100644 --- a/example/half/xpy.cpp +++ b/example/half/xpy.cpp @@ -96,7 +96,7 @@ void do_xpy(size_t n, bool time_only = false) { double s = timer.seconds(); if (!time_only) { - for (int i = 0; i < n; i++) + for (size_t i = 0; i < n; i++) expected(i) = static_cast(y_host(i)) + static_cast(x_host(i)); } @@ -104,13 +104,15 @@ void do_xpy(size_t n, bool time_only = false) { deep_copy(x_host, x); ExecutionSpace().fence(); - std::cout << "n: " << n << ", Runtime(s): " << s << std::endl; + std::cout << "n: " << n << ", " << typeid(ScalarType).name() + << " Runtime(s): " << s << std::endl; if (!time_only) { - std::cout << "-- " << typeid(ScalarType).name() << " Relative Errors --" - << std::endl; - for (int i = 0; i < n; i++) { - std::cout << std::abs(expected(i) - x_host(i)) / expected(i) << ", "; + std::cout << "n: " << n << ", " << typeid(ScalarType).name() + << " Relative Errors:" << std::endl; + for (size_t i = 0; i < n; i++) { + std::cout << ", " << std::abs(expected(i) - x_host(i)) / expected(i) + << std::endl; } std::cout << std::endl << std::endl; } From 6ede53cbd5f4bee1a7af11272e273b823dd05757 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 25 May 2022 15:49:01 -0600 Subject: [PATCH 154/261] example/half: - Add Luc's fixes for AMPERE80 - Disable kokkos-kernels tests and enable examples - Remove cat of xpy.out --- .../half/us-rse-escience-2022-reproducer.sh | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/example/half/us-rse-escience-2022-reproducer.sh b/example/half/us-rse-escience-2022-reproducer.sh index 39c233eb1c..8e77f72bc4 100755 --- a/example/half/us-rse-escience-2022-reproducer.sh +++ b/example/half/us-rse-escience-2022-reproducer.sh @@ -15,10 +15,11 @@ function printhelp() { echo "--Usage--" echo "$0 HOST_ARCH " echo " HOST_ARCH: POWER9, A64FX, SKX" - echo " ACCELERATOR_ARCH: VOLTA70" + echo " ACCELERATOR_ARCH: VOLTA70, AMPERE80" echo "" - echo "Invocation used for us-rse-escience-2022 results:" + echo "Invocations used to collect us-rse-escience-2022 results:" echo "env KOKKOS_SRC_DIR=$HOME/KOKKOS.base/kokkos KOKKOSKERNELS_SRC_DIR=$HOME/KOKKOS.base/kokkos-kernels/ KOKKOSKERNELS_SHA=TODO-HEAD-SHA ./us-rse-escience-2022-reproducer.sh POWER9 VOLTA70" + echo "env KOKKOS_SRC_DIR=$HOME/KOKKOS.base/kokkos KOKKOSKERNELS_SRC_DIR=$HOME/KOKKOS.base/kokkos-kernels/ KOKKOSKERNELS_SHA=TODO-HEAD-SHA ./us-rse-escience-2022-reproducer.sh AMPERE80" } function earlyexit() { @@ -77,8 +78,8 @@ elif [ "$arch_names" == "POWER9 VOLTA70" ]; then | tee -a kokkos_config_cmd.out" kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ - --arch=Power9,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ - --cxxflags='-O3' \ + --arch=Power9,Volta70 --with-cuda=$CUDA_PATH -- --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' --disable-tests --enable-examples \ --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ tee kokkoskernels_config_cmd.out" @@ -92,43 +93,42 @@ elif [ "$arch_names" == "POWER9 VOLTA70" ]; then elif [ "$arch_names" == "SNB VOLTA70" ]; then module purge module load sems-archive-env sems-env sems-gcc/8.3.0 sems-cmake/3.19.1 cuda/11.2 sems-archive-git/2.10.1 - kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ - --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" - kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ | tee -a kokkos_config_cmd.out" - kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ - --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ - --cxxflags='-O3' \ - --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ - --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' --disable-tests --enable-examples \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ tee kokkoskernels_config_cmd.out" - kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ - -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh" kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh" benchmark_cmd="$KOKKOSKERNELS_BUILD_DIR/bench.sh" -elif [ "$arch_names" == "ZEN2 AMPERE80" ]; then +elif [ "$arch_names" == "AMPERE80" ]; then module purge - module load gcc/8.1.0 cuda/11.2.0 git/TODO cmake/TODO - kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ - --arch=Zen2,Ampere80 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ - --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" - kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ - | tee -a kokkos_config_cmd.out" + module load cudatoolkit/11.2 cmake/3.22.0 - kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ - --arch=Zen2,Ampere80 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ - --cxxflags='-O3' \ - --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ - --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ - tee kokkoskernels_config_cmd.out" - kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ - -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ - $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=Ampere80 --with-cuda=$CUDA_HOME --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR &> kokkos_config_cmd.out" + + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR &> kokkos_config_cmd.out" + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --arch=Ampere80 --with-cuda=$CUDA_HOME --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' --disable-tests --enable-examples \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR &> kokkoskernels_config_cmd.out" + + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -S $KOKKOSKERNELS_SRC_DIR -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF &> kokkoskernels_config_cmd.out" kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh" kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh" @@ -147,6 +147,7 @@ elif [ "$arch_names" == "A64FX " ]; then kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ --cxxflags='-msve-vector-bits=512 -Ofast' --arch=A64FX --with-openmp \ + --disable-tests --enable-examples \ --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ tee kokkoskernels_config_cmd.out" @@ -168,16 +169,16 @@ elif [ "$arch_names" == "SKX " ]; then --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ | tee -a kokkos_config_cmd.out" - + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ - --cxxflags='-O3' --arch=SKX --with-openmp \ + --cxxflags='-O3' --arch=SKX --with-openmp --disable-tests --enable-examples \ --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ tee kokkoskernels_config_cmd.out" kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" - + kokkos_build_cmd="srun --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh" kokkoskernels_build_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh" benchmark_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh" @@ -236,4 +237,3 @@ beval $kokkoskernels_build_cmd # Run the benchmark beval $benchmark_cmd -beval "cat ${benchmark_dir}/xpy.out" From 399e6d8bfd9530b9e7e9f04a27ab49ce7121ae76 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Thu, 26 May 2022 08:32:51 -0700 Subject: [PATCH 155/261] Add printf statements --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 74 ++++++++++++------- .../KokkosSparse_spiluk_symbolic_impl.hpp | 20 ++++- 2 files changed, 68 insertions(+), 26 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 2c3c8dd1c2..0e8981cb81 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -244,28 +244,31 @@ struct ILUKLvlSchedTP1NumericFunctor { void operator()(const member_type &team) const { auto my_league = team.league_rank(); // map to rowid auto rowid = level_idx(my_league + lev_start); - auto my_team = team.team_rank(); + //auto my_team = team.team_rank(); auto k1 = L_row_map(rowid); auto k2 = L_row_map(rowid + 1); #ifdef KEEP_DIAG Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const size_type k) { - auto col = L_entries(k); + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(L_entries(k)); L_values(k) = 0.0; + //if (iw(my_league, col) != -1) printf("L initialize k %d, col %d\n", k, col); iw(my_league, col) = k; }); #else Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - auto col = L_entries(k); + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(L_entries(k)); L_values(k) = 0.0; iw(my_league, col) = k; }); #endif #ifdef KEEP_DIAG - if (my_team == 0) L_values(k2 - 1) = scalar_t(1.0); + //if (my_team == 0) L_values(k2 - 1) = scalar_t(1.0); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { L_values(k2 - 1) = scalar_t(1.0); }); #endif team.team_barrier(); @@ -273,9 +276,10 @@ struct ILUKLvlSchedTP1NumericFunctor { k1 = U_row_map(rowid); k2 = U_row_map(rowid + 1); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - auto col = U_entries(k); + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(U_entries(k)); U_values(k) = 0.0; + //if (iw(my_league, col) != -1) printf("U initialize k %d, col %d\n", k, col); iw(my_league, col) = k; }); @@ -285,9 +289,10 @@ struct ILUKLvlSchedTP1NumericFunctor { k1 = A_row_map(rowid); k2 = A_row_map(rowid + 1); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { - auto col = A_entries(k); - auto ipos = iw(my_league, col); + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(A_entries(k)); + nnz_lno_t ipos = iw(my_league, col); + //if (ipos == -1) printf("A populate k %d, col %d\n", k, col); if (col < rowid) L_values(ipos) = A_values(k); else @@ -310,7 +315,8 @@ struct ILUKLvlSchedTP1NumericFunctor { #else auto fact = L_values(k) * U_values(U_row_map(prev_row)); #endif - if (my_team == 0) L_values(k) = fact; + //if (my_team == 0) L_values(k) = fact; + Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; }); team.team_barrier(); @@ -318,8 +324,8 @@ struct ILUKLvlSchedTP1NumericFunctor { Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, U_row_map(prev_row + 1)), [&](const size_type kk) { - auto col = U_entries(kk); - auto ipos = iw(my_league, col); + nnz_lno_t col = static_cast(U_entries(kk)); + nnz_lno_t ipos = iw(my_league, col); if (ipos != -1) { auto lxu = -U_values(kk) * fact; if (col < rowid) @@ -332,19 +338,22 @@ struct ILUKLvlSchedTP1NumericFunctor { team.team_barrier(); } // end for k - if (my_team == 0) { + //if (my_team == 0) { + Kokkos::single(Kokkos::PerTeam(team), [&]() { + nnz_lno_t ipos = iw(my_league, rowid); #ifdef KEEP_DIAG - if (U_values(iw(my_league, rowid)) == 0.0) { - U_values(iw(my_league, rowid)) = 1e6; + if (U_values(ipos) == 0.0) { + U_values(ipos) = 1e6; } #else - if (U_values(iw(my_league, rowid)) == 0.0) { - U_values(iw(my_league, rowid)) = 1e6; + if (U_values(ipos) == 0.0) { + U_values(ipos) = 1e6; } else { - U_values(iw(my_league, rowid)) = 1.0 / U_values(iw(my_league, rowid)); + U_values(ipos) = 1.0 / U_values(ipos); } #endif - } + }); + //} team.team_barrier(); @@ -354,18 +363,27 @@ struct ILUKLvlSchedTP1NumericFunctor { #ifdef KEEP_DIAG Kokkos::parallel_for( Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const size_type k) { iw(my_league, L_entries(k)) = -1; }); + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_league, col) = -1; + }); #else Kokkos::parallel_for( Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { iw(my_league, L_entries(k)) = -1; }); + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_league, col) = -1; + }); #endif k1 = U_row_map(rowid); k2 = U_row_map(rowid + 1); Kokkos::parallel_for( Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { iw(my_league, U_entries(k)) = -1; }); + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(U_entries(k)); + iw(my_league, col) = -1; + }); } }; @@ -710,6 +728,9 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, // Main loop must be performed sequential. Question: Try out Cuda's graph // stuff to reduce kernel launch overhead + printf("work array iw %d x %d\n",iw.extent(0),iw.extent(1)); + int tmpcnt = 0; + int tmpnrows = 0; for (size_type lvl = 0; lvl < nlevels; ++lvl) { nnz_lno_t lev_start = level_ptr_h(lvl); nnz_lno_t lev_end = level_ptr_h(lvl + 1); @@ -758,12 +779,15 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, Kokkos::parallel_for("parfor_l_team", policy_type(lvl_nrows_chunk, team_size), tstf); - + Kokkos::fence(); lvl_rowid_start += lvl_nrows_chunk; + tmpcnt++; + tmpnrows += lvl_nrows_chunk; } } } // end if } // end for lvl + printf("Total kernel calls %d, total nrows %d\n",tmpcnt, tmpnrows); } // Output check diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 18e0e54eef..411f91fb0b 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -54,6 +54,8 @@ #include #include +#include + //#define SYMBOLIC_OUTPUT_INFO namespace KokkosSparse { @@ -200,6 +202,10 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, lnrowsperchunk(i) = (lnrows % lnchunks(i) == 0) ? (lnrows / lnchunks(i)) : (lnrows / lnchunks(i) + 1); + if ((i < 10) || (i >= nlevels-10)) + printf("Level %d, lnrows %d, nrows %d, required size %ld, avail_byte %ld, nchunks %d, rows per chunk %d\n",i,lnrows,nrows,required_size,avail_byte,lnchunks(i),lnrowsperchunk(i)); + if (lnrows == 312) + printf("Level %d, lnrows %d, nrows %d, required size %ld, avail_byte %ld, nchunks %d, rows per chunk %d\n",i,lnrows,nrows,required_size,avail_byte,lnchunks(i),lnrowsperchunk(i)); } else #endif { @@ -215,7 +221,7 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, thandle.set_level_maxrows(maxrows); thandle.set_level_maxrowsperchunk(maxrowsperchunk); - level_nchunks = lnchunks; + level_nchunks = lnchunks; printf("nlevels %d, maxrows %d, maxrowsperchunk %d\n", nlevels, maxrows, maxrowsperchunk); level_nrowsperchunk = lnrowsperchunk; } @@ -447,6 +453,9 @@ void iluk_symbolic(IlukHandle& thandle, using HostTmpViewType = Kokkos::View; + struct timeval begin, end;//VINH TEST + gettimeofday( &begin, NULL ); + HostTmpViewType h_lev("h_lev", thandle.get_nnzU()); HostTmpViewType h_iw("h_iw", nrows); HostTmpViewType h_iL("h_iL", nrows); @@ -580,7 +589,11 @@ void iluk_symbolic(IlukHandle& thandle, thandle.set_nnzL(cntL); thandle.set_nnzU(cntU); + gettimeofday( &end, NULL ); + printf(" VINH TEST: symbolic -- main %.8lf (sec.)\n", 1.0 * ( end.tv_sec - begin.tv_sec ) + 1.0e-6 * ( end.tv_usec - begin.tv_usec )); + // Sort + gettimeofday( &begin, NULL ); for (size_type row_id = 0; row_id < static_cast(L_row_map.extent(0)) - 1; row_id++) { size_type row_start = L_row_map(row_id); @@ -593,8 +606,11 @@ void iluk_symbolic(IlukHandle& thandle, size_type row_end = U_row_map(row_id + 1); Kokkos::sort(subview(U_entries, Kokkos::make_pair(row_start, row_end))); } + gettimeofday( &end, NULL ); + printf(" VINH TEST: symbolic -- sort %.8lf (sec.)\n", 1.0 * ( end.tv_sec - begin.tv_sec ) + 1.0e-6 * ( end.tv_usec - begin.tv_usec )); // Level scheduling on L + gettimeofday( &begin, NULL ); if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) { level_sched_hashmap(thandle, L_row_map, L_entries, U_row_map, U_entries, @@ -626,6 +642,8 @@ void iluk_symbolic(IlukHandle& thandle, Kokkos::deep_copy(U_entries_d, U_entries); thandle.set_symbolic_complete(); + gettimeofday( &end, NULL ); + printf(" VINH TEST: symbolic -- sched + copy %.8lf (sec.)\n", 1.0 * ( end.tv_sec - begin.tv_sec ) + 1.0e-6 * ( end.tv_usec - begin.tv_usec )); // Output check #ifdef SYMBOLIC_OUTPUT_INFO From 957ffa3de775d52a8ff71c1d9fddf6b6007b8dfb Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 26 May 2022 08:09:28 -0600 Subject: [PATCH 156/261] cmake: - Add INST_BHALF option perf_test/blas/blas3: Update benchmark script - Use Kokkos 3.6.00 - Use KokkosKernels half_examples - Add Ampere80 - Added SNB VOLTA70 --- cmake/KokkosKernels_config.h.in | 2 + cmake/kokkoskernels_eti_floats.cmake | 8 +++ .../KokkosBatched_BatchedGemm_benchmark.sh | 57 ++++++++++++++++--- src/batched/KokkosBatched_Util.hpp | 3 +- src/common/KokkosKernels_default_types.hpp | 2 + test_common/KokkosKernels_TestUtils.hpp | 9 +++ 6 files changed, 73 insertions(+), 8 deletions(-) diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index f8dd2ae133..1fb6a31544 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -70,6 +70,8 @@ #cmakedefine KOKKOSKERNELS_INST_FLOAT /* Whether to build kernels for scalar type Kokkos::Experimental::half_t */ #cmakedefine KOKKOSKERNELS_INST_HALF +/* Whether to build kernels for scalar type Kokkos::Experimental::bhalf_t */ +#cmakedefine KOKKOSKERNELS_INST_BHALF /* Whether to build kernels for scalar type complex */ #cmakedefine KOKKOSKERNELS_INST_COMPLEX_DOUBLE /* Whether to build kernels for scalar type complex */ diff --git a/cmake/kokkoskernels_eti_floats.cmake b/cmake/kokkoskernels_eti_floats.cmake index debf99bb0e..3448874336 100644 --- a/cmake/kokkoskernels_eti_floats.cmake +++ b/cmake/kokkoskernels_eti_floats.cmake @@ -25,6 +25,13 @@ KOKKOSKERNELS_ADD_OPTION( "Whether to pre instantiate kernels for the scalar type Kokkos::Experimental::half_t. Disabling this may increase build times. Default: OFF" ) +KOKKOSKERNELS_ADD_OPTION( + INST_BHALF + OFF + BOOL + "Whether to pre instantiate kernels for the scalar type Kokkos::Experimental::bhalf_t. Disabling this may increase build times. Default: OFF" +) + SET(FLOATS FLOAT DOUBLE @@ -33,6 +40,7 @@ SET(FLOATS SET(DOUBLE_CPP_TYPE "double") SET(FLOAT_CPP_TYPE "float") SET(HALF_CPP_TYPE "Kokkos::Experimental::half_t") +SET(BHALF_CPP_TYPE "Kokkos::Experimental::bhalf_t") SET(COMPLEX_FLOAT_CPP_TYPE "Kokkos::complex") SET(COMPLEX_DOUBLE_CPP_TYPE "Kokkos::complex") diff --git a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh index 0b08977748..fdd9558b14 100755 --- a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh +++ b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh @@ -18,8 +18,8 @@ function printhelp() { echo "--Usage--" echo "$0 PRECISION HOST_ARCH " echo " PRECISION: Kokkos::Experimental::half_t, float, double" - echo " HOST_ARCH: POWER9, A64FX, SKX" - echo " ACCELERATOR_ARCH: VOLTA70" + echo " HOST_ARCH: POWER9, A64FX, SKX, SNB, DEFAULT" + echo " ACCELERATOR_ARCH: VOLTA70 AMPERE80" echo "" } @@ -47,10 +47,10 @@ function beval() { # Handle input args export KOKKOS_SRC_DIR=${KOKKOS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos"} export KOKKOS_SRC_DIR=$(realpath $KOKKOS_SRC_DIR) -export KOKKOS_SHA=${KOKKOS_SHA:-"b9f15a4"} # Tip of develop as of 10-14-21 +export KOKKOS_SHA=${KOKKOS_SHA:-"tags/3.6.00"} export KOKKOSKERNELS_SRC_DIR=${KOKKOSKERNELS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos-kernels"} export KOKKOSKERNELS_SRC_DIR=$(realpath $KOKKOSKERNELS_SRC_DIR) -export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"a2fff48"} # Tip of developer as of 10-14-21 +export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"half_examples"} envprint KOKKOS_SRC_DIR KOKKOS_SHA KOKKOSKERNELS_SRC_DIR KOKKOSKERNELS_SHA dry_run="off" @@ -82,7 +82,7 @@ elif [ "$arch_names" == "POWER9 VOLTA70" ]; then kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ --arch=Power9,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ - --cxxflags='-O3' --with-scalars=$precision \ + --cxxflags='-O3' --disable-tests --enable-examples --with-scalars=$precision \ --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ tee kokkoskernels_config_cmd.out" @@ -93,6 +93,49 @@ elif [ "$arch_names" == "POWER9 VOLTA70" ]; then kokkos_build_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOS_BUILD_DIR/build.sh" kokkoskernels_build_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/build.sh" benchmark_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/bench.sh" +elif [ "$arch_names" == "SNB VOLTA70" ]; then + module purge + module load sems-archive-env sems-env sems-gcc/8.3.0 sems-cmake/3.19.1 cuda/11.2 sems-archive-git/2.10.1 + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' --with-scalars=$precision \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="$KOKKOSKERNELS_BUILD_DIR/bench.sh" +elif [ "$arch_names" == "DEFAULT AMPERE80" ]; then + module purge + module load cudatoolkit/11.2 cmake/3.22.0 + + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=Ampere80 --with-cuda=$CUDA_HOME --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR &> kokkos_config_cmd.out" + + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR &> kokkos_config_cmd.out" + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --arch=Ampere80 --with-cuda=$CUDA_HOME --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' --with-scalars=$precision \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR &> kokkoskernels_config_cmd.out" + + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -S $KOKKOSKERNELS_SRC_DIR -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF &> kokkoskernels_config_cmd.out" + + kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="$KOKKOSKERNELS_BUILD_DIR/bench.sh" elif [ "$arch_names" == "A64FX " ]; then export OMP_PROC_BIND=close export OMP_PLACES=cores @@ -128,7 +171,7 @@ elif [ "$arch_names" == "SKX " ]; then --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ | tee -a kokkos_config_cmd.out" - + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ --cxxflags='-O3' --arch=SKX --with-scalars=$precision --with-openmp \ --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ @@ -137,7 +180,7 @@ elif [ "$arch_names" == "SKX " ]; then kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" - + kokkos_build_cmd="srun --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh" kokkoskernels_build_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh" benchmark_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh" diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 0d2eb7f395..cdb3c55d3c 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -201,7 +201,8 @@ struct SIMD { std::is_same >::value || std::is_same >::value || std::is_same >::value || - std::is_same::value, + std::is_same::value || + std::is_same::value, "KokkosKernels:: Invalid SIMD<> type."); using value_type = T; }; diff --git a/src/common/KokkosKernels_default_types.hpp b/src/common/KokkosKernels_default_types.hpp index 4012b2e158..d70a6b27ac 100644 --- a/src/common/KokkosKernels_default_types.hpp +++ b/src/common/KokkosKernels_default_types.hpp @@ -79,6 +79,8 @@ using default_scalar = double; using default_scalar = float; #elif defined(KOKKOSKERNELS_INST_HALF) using default_scalar = Kokkos::Experimental::half_t; +#elif defined(KOKKOSKERNELS_INST_BHALF) +using default_scalar = Kokkos::Experimental::bhalf_t; #else using default_scalar = double; #endif diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index ec27c44f50..a3a1ebf964 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -340,6 +340,15 @@ class epsilon { }; #endif // KOKKOS_HALF_T_IS_FLOAT +// explicit epsilon specializations +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +template <> +class epsilon { + public: + constexpr static double value = 0.0078125F; +}; +#endif // KOKKOS_HALF_T_IS_FLOAT + using KokkosKernels::Impl::getRandomBounds; template Date: Thu, 26 May 2022 12:01:22 -0400 Subject: [PATCH 157/261] Update SYCL CI --- scripts/docker/Dockerfile.sycl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index f5197ab7b3..3d94a1a45e 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -38,8 +38,8 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO ENV PATH=${CMAKE_DIR}/bin:$PATH ENV SYCL_DIR=/opt/sycl -RUN SYCL_VERSION=2021-09 && \ - SYCL_URL=https://github.com/intel/llvm/archive && \ +RUN SYCL_VERSION=20220112 && \ + SYCL_URL=https://github.com/intel/llvm/archive/sycl-nightly && \ SYCL_ARCHIVE=${SYCL_VERSION}.tar.gz && \ SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ wget --quiet ${SYCL_URL}/${SYCL_ARCHIVE} && \ From 22d9c9809685d5fbde0d6b98d6f2fa04bab43ee6 Mon Sep 17 00:00:00 2001 From: Carl William Pearson Date: Fri, 20 May 2022 14:39:22 -0600 Subject: [PATCH 158/261] Make cuSparse TPL available for Bsrmatrix SpMV The Kokkos::spmv function was improperly using template parameters to select the native vs TPL version. A common thread of erros was to assume the 3rd-to-last template parameter was for TPL availablility, when it was not. There were also further errors in inverting the logic on that parameter. We also remove LayoutRight for the BsrMatrix SpMV, as it is not supported by the underlying cuSparse function. for X,Y LayoutLeft we want cuSparse to do C = A * B + C and for X,Y LayoutRight we want cuSparse to do trans(C) = A * trans(B) + trans(C) -> t(t(C)) = t(A * t(B)) + t(t(C)) -> C = t(t(B)) * t(A) + C -> C = B * t(A) + C That is not possible with the current cuSparse level 3 functions. --- ...osSparse_spmv_bsrmatrix_tpl_spec_avail.hpp | 68 ++-------- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 116 +++++++++--------- src/sparse/KokkosSparse_spmv.hpp | 17 +-- 3 files changed, 80 insertions(+), 121 deletions(-) diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index cd8287b38e..705422ff33 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -195,93 +195,49 @@ struct spmv_mv_bsrmatrix_tpl_spec_avail { // These versions of cuSPARSE require the ordinal and offset types to be the // same. For KokkosKernels, this means int/int only. - -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \ - SCALAR, ORDINAL, OFFSET, XL, YL, MEMSPACE) \ - template <> \ - struct spmv_mv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const ORDINAL, Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET, const SCALAR*, \ - XL, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - YL, Kokkos::Device, \ - Kokkos::MemoryTraits, true> { \ - enum : bool { value = true }; \ +// cuSapars level 3 does not currently support LayoutRight +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \ + SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ + template <> \ + struct spmv_mv_bsrmatrix_tpl_spec_avail< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET, const SCALAR**, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, false> { \ + enum : bool { value = true }; \ }; #if (9000 <= CUDA_VERSION) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, - int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, - int, int, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, - int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, - int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif // CUDA/CUSPARSE >= 9.0? diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 6ef47f8008..f73c09c712 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -42,8 +42,8 @@ //@HEADER */ -#ifndef KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP -#define KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP +#ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP +#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP #include "KokkosKernels_Controls.hpp" #include "KokkosKernels_SparseUtils_mkl.hpp" @@ -562,8 +562,24 @@ void spmv_block_impl_cusparse( // - Only blockDim > 1 is supported // - Only CUSPARSE_OPERATION_NON_TRANSPOSE is supported // - Only CUSPARSE_MATRIX_TYPE_GENERAL is supported. +// - Only LayoutLeft for X and Y: +// for X,Y LayoutLeft we want cuSparse to do +// C = A * B + C +// and for X,Y LayoutRight we want cuSparse to do +// trans(C) = A * trans(B) + trans(C) +// -> t(t(C)) = t(A * t(B)) + t(t(C)) +// -> C = t(t(B)) * t(A) + C +// -> C = B * t(A) + C +// This is impossible in cuSparse without explicitly transposing C, +// so we just do not support LayoutRight in cuSparse TPL now // -template +template < + class AMatrix, class XVector, class YVector, + std::enable_if_t::value && + std::is_same::value, + bool> = true> void spm_mv_block_impl_cusparse( const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, @@ -587,8 +603,15 @@ void spm_mv_block_impl_cusparse( } int colx = static_cast(x.extent(1)); - int ldx = static_cast(x.stride_1()); - int ldy = static_cast(y.stride_1()); + + // ldx and ldy should be the leading dimension of X,Y respectively + const int ldx = static_cast(x.extent(0)); + const int ldy = static_cast(y.extent(0)); + if (!std::is_same::value) { + std::cerr << "X,Y must be LayoutLeft cusparse[*]bsrmv.\n"; + throw std::invalid_argument("Invalid layout"); + } #if (9000 <= CUDA_VERSION) @@ -745,29 +768,31 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -#endif +#endif // 9000 <= CUDA_VERSION #undef KOKKOSSPARSE_SPMV_CUSPARSE -#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ - COMPILE_LIBRARY) \ +// cuSparse TPL does not support LayoutRight for this operation +// only specialize for LayoutLeft +#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, SPACE, \ + ETI_AVAIL) \ template <> \ struct SPMV_MV_BSRMATRIX< \ SCALAR const, ORDINAL const, Kokkos::Device, \ Kokkos::MemoryTraits, OFFSET const, SCALAR const**, \ - LAYOUT, Kokkos::Device, \ + Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, \ - SCALAR**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, true, true, COMPILE_LIBRARY> { \ + SCALAR**, Kokkos::LayoutLeft, Kokkos::Device, \ + Kokkos::MemoryTraits, false, true, ETI_AVAIL> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ using AMatrix = BsrMatrix; \ using XVector = Kokkos::View< \ - SCALAR const**, LAYOUT, device_type, \ + SCALAR const**, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View; \ + using YVector = Kokkos::View; \ using Controls = KokkosKernels::Experimental::Controls; \ \ using coefficient_type = typename YVector::non_const_value_type; \ @@ -786,55 +811,32 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, }; #if (9000 <= CUDA_VERSION) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, true) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, false) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaSpace, true) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaSpace, false) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace, false) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace, false) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaUVMSpace, true) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaUVMSpace, false) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaUVMSpace, true) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaUVMSpace, false) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace, false) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -#endif + Kokkos::CudaUVMSpace, false) + +#endif // 9000 <= CUDA_VERSION #undef KOKKOSSPARSE_SPMV_MV_CUSPARSE @@ -842,6 +844,6 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, } // namespace Experimental } // namespace KokkosSparse -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE -#endif // KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP +#endif // KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index 52c9b4e0bf..972bbc74ad 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -894,8 +894,10 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // // Whether to call KokkosKernel's native implementation, even if a TPL impl is // available - bool useFallback = controls.isParameter("algorithm") && - controls.getParameter("algorithm") == "native"; + bool useFallback = + controls.isParameter("algorithm") && + (controls.getParameter("algorithm") == "native" || + controls.getParameter("algorithm") == "experimental_bsr_tc"); #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE // cuSPARSE does not support the modes (C), (T), (H) @@ -936,6 +938,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename YVector_Internal::array_layout, typename YVector_Internal::device_type, typename YVector_Internal::memory_traits, + std::is_integral::value, false>::spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); Kokkos::Profiling::popRegion(); } else { @@ -952,11 +955,9 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename YVector_Internal::value_type**, typename YVector_Internal::array_layout, typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>::spmv_mv_bsrmatrix(controls, - mode, - alpha, A_i, - x_i, beta, - y_i); + typename YVector_Internal::memory_traits, + std::is_integral::value>:: + spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); } } @@ -1097,7 +1098,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], /// entries of y; if alpha == 0, ignore the entries of A and x. /// /// If \c AMatrix is a KokkosSparse::Experimental::BsrMatrix, controls may have -/// \c "algorithm" = \c "experimental_tc_bsr" to use Nvidia tensor cores on +/// \c "algorithm" = \c "experimental_bsr_tc" to use Nvidia tensor cores on /// Volta or Ampere architectures. On Volta-architecture GPUs the only available /// precision is mixed-precision fp32 accumulator from fp16 inputs. On /// Ampere-architecture GPUs (cc >= 80), mixed precision is used when A is fp16, From f9f433bf2baf5d7cf10c07e497fed50d8e71a2f8 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Fri, 27 May 2022 00:29:04 -0600 Subject: [PATCH 159/261] Test --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 0e8981cb81..53808882c9 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -335,6 +335,20 @@ struct ILUKLvlSchedTP1NumericFunctor { } }); // end for kk + //Kokkos::single(Kokkos::PerTeam(team), [&]() { + // for (size_type kk = U_row_map(prev_row) + 1; kk < U_row_map(prev_row + 1); kk++) { + // nnz_lno_t col = static_cast(U_entries(kk)); + // nnz_lno_t ipos = iw(my_league, col); + // if (ipos != -1) { + // auto lxu = -U_values(kk) * fact; + // if (col < rowid) + // Kokkos::atomic_add(&L_values(ipos), lxu); + // else + // Kokkos::atomic_add(&U_values(ipos), lxu); + // } + // } // end for kk + //}); + team.team_barrier(); } // end for k From 2eb530c3ed373fccde8f84871ce14e8b21c6e9af Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 30 May 2022 21:43:05 -0700 Subject: [PATCH 160/261] Some changes to symbolic and mumeric of spiluk --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 256 ++++++++++-------- .../KokkosSparse_spiluk_symbolic_impl.hpp | 3 +- 2 files changed, 151 insertions(+), 108 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 53808882c9..a4733d5379 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -53,6 +53,7 @@ #include //#define NUMERIC_OUTPUT_INFO +//#define NUMERIC_USE_FOR namespace KokkosSparse { namespace Impl { @@ -207,18 +208,18 @@ struct ILUKLvlSchedTP1NumericFunctor { using lno_t = typename AEntriesType::non_const_value_type; using scalar_t = typename AValuesType::non_const_value_type; - ARowMapType A_row_map; + ARowMapType A_row_map; AEntriesType A_entries; - AValuesType A_values; - LRowMapType L_row_map; + AValuesType A_values; + LRowMapType L_row_map; LEntriesType L_entries; - LValuesType L_values; - URowMapType U_row_map; + LValuesType L_values; + URowMapType U_row_map; UEntriesType U_entries; - UValuesType U_values; + UValuesType U_values; LevelViewType level_idx; WorkViewType iw; - nnz_lno_t lev_start; + nnz_lno_t lev_start; ILUKLvlSchedTP1NumericFunctor( const ARowMapType &A_row_map_, const AEntriesType &A_entries_, @@ -242,119 +243,144 @@ struct ILUKLvlSchedTP1NumericFunctor { KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = level_idx(my_league + lev_start); - //auto my_team = team.team_rank(); + nnz_lno_t my_team = static_cast(team.league_rank()); + nnz_lno_t rowid = static_cast(level_idx(my_team + lev_start));// map to rowid + nnz_lno_t my_thread = static_cast(team.team_rank()); + nnz_lno_t ts = static_cast(team.team_size()); - auto k1 = L_row_map(rowid); - auto k2 = L_row_map(rowid + 1); + nnz_lno_t k1 = static_cast(L_row_map(rowid)); + nnz_lno_t k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - //if (iw(my_league, col) != -1) printf("L initialize k %d, col %d\n", k, col); - iw(my_league, col) = k; - }); +#ifndef NUMERIC_USE_FOR + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + iw(my_team, col) = k; + }); #else - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - iw(my_league, col) = k; - }); + for (nnz_lno_t k = k1 + my_thread; k < k2 - 1; k += ts) { + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + iw(my_team, col) = k; + } +#endif +#else +#ifndef NUMERIC_USE_FOR + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + iw(my_team, col) = k; + }); +#else + for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + iw(my_team, col) = k; + } +#endif #endif #ifdef KEEP_DIAG - //if (my_team == 0) L_values(k2 - 1) = scalar_t(1.0); + //if (my_thread == 0) L_values(k2 - 1) = scalar_t(1.0); Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k2 - 1) = scalar_t(1.0); }); #endif team.team_barrier(); - k1 = U_row_map(rowid); - k2 = U_row_map(rowid + 1); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(U_entries(k)); - U_values(k) = 0.0; - //if (iw(my_league, col) != -1) printf("U initialize k %d, col %d\n", k, col); - iw(my_league, col) = k; - }); + k1 = static_cast(U_row_map(rowid)); + k2 = static_cast(U_row_map(rowid + 1)); +#ifndef NUMERIC_USE_FOR + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(U_entries(k)); + U_values(k) = 0.0; + iw(my_team, col) = k; + }); +#else + for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { + nnz_lno_t col = static_cast(U_entries(k)); + U_values(k) = 0.0; + iw(my_team, col) = k; + } +#endif team.team_barrier(); // Unpack the ith row of A - k1 = A_row_map(rowid); - k2 = A_row_map(rowid + 1); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(A_entries(k)); - nnz_lno_t ipos = iw(my_league, col); - //if (ipos == -1) printf("A populate k %d, col %d\n", k, col); - if (col < rowid) - L_values(ipos) = A_values(k); - else - U_values(ipos) = A_values(k); - }); + k1 = static_cast(A_row_map(rowid)); + k2 = static_cast(A_row_map(rowid + 1)); +#ifndef NUMERIC_USE_FOR + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(A_entries(k)); + nnz_lno_t ipos = iw(my_team, col); + if (col < rowid) + L_values(ipos) = A_values(k); + else + U_values(ipos) = A_values(k); + }); +#else + for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { + nnz_lno_t col = static_cast(A_entries(k)); + nnz_lno_t ipos = iw(my_team, col); + if (col < rowid) + L_values(ipos) = A_values(k); + else + U_values(ipos) = A_values(k); + } +#endif team.team_barrier(); // Eliminate prev rows - k1 = L_row_map(rowid); - k2 = L_row_map(rowid + 1); + k1 = static_cast(L_row_map(rowid)); + k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG - for (auto k = k1; k < k2 - 1; ++k) { + for (nnz_lno_t k = k1; k < k2 - 1; k++) #else - for (auto k = k1; k < k2; ++k) { + for (nnz_lno_t k = k1; k < k2; k++) #endif - auto prev_row = L_entries(k); + { + nnz_lno_t prev_row = L_entries(k); #ifdef KEEP_DIAG - auto fact = L_values(k) / U_values(U_row_map(prev_row)); + scalar_t fact = L_values(k) / U_values(U_row_map(prev_row)); #else - auto fact = L_values(k) * U_values(U_row_map(prev_row)); + scalar_t fact = L_values(k) * U_values(U_row_map(prev_row)); #endif - //if (my_team == 0) L_values(k) = fact; + //if (my_thread == 0) L_values(k) = fact; Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; }); team.team_barrier(); - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, - U_row_map(prev_row + 1)), - [&](const size_type kk) { - nnz_lno_t col = static_cast(U_entries(kk)); - nnz_lno_t ipos = iw(my_league, col); - if (ipos != -1) { - auto lxu = -U_values(kk) * fact; - if (col < rowid) - Kokkos::atomic_add(&L_values(ipos), lxu); - else - Kokkos::atomic_add(&U_values(ipos), lxu); - } - }); // end for kk - - //Kokkos::single(Kokkos::PerTeam(team), [&]() { - // for (size_type kk = U_row_map(prev_row) + 1; kk < U_row_map(prev_row + 1); kk++) { - // nnz_lno_t col = static_cast(U_entries(kk)); - // nnz_lno_t ipos = iw(my_league, col); - // if (ipos != -1) { - // auto lxu = -U_values(kk) * fact; - // if (col < rowid) - // Kokkos::atomic_add(&L_values(ipos), lxu); - // else - // Kokkos::atomic_add(&U_values(ipos), lxu); - // } - // } // end for kk - //}); - +#ifndef NUMERIC_USE_FOR + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, U_row_map(prev_row + 1)), [&](const size_type kk) { + nnz_lno_t col = static_cast(U_entries(kk)); + nnz_lno_t ipos = iw(my_team, col); + if (ipos != -1) { + auto lxu = -U_values(kk) * fact; + if (col < rowid) + Kokkos::atomic_add(&L_values(ipos), lxu); + else + Kokkos::atomic_add(&U_values(ipos), lxu); + } + }); // end for kk +#else + for (nnz_lno_t kk = U_row_map(prev_row) + 1 + my_thread; kk < U_row_map(prev_row + 1); kk += ts) { + nnz_lno_t col = static_cast(U_entries(kk)); + nnz_lno_t ipos = iw(my_team, col); + if (ipos != -1) { + auto lxu = -U_values(kk) * fact; + if (col < rowid) + Kokkos::atomic_add(&L_values(ipos), lxu); + else + Kokkos::atomic_add(&U_values(ipos), lxu); + } + } // end for kk +#endif team.team_barrier(); } // end for k - //if (my_team == 0) { + //if (my_thread == 0) { Kokkos::single(Kokkos::PerTeam(team), [&]() { - nnz_lno_t ipos = iw(my_league, rowid); + nnz_lno_t ipos = iw(my_team, rowid); #ifdef KEEP_DIAG if (U_values(ipos) == 0.0) { U_values(ipos) = 1e6; @@ -372,32 +398,47 @@ struct ILUKLvlSchedTP1NumericFunctor { team.team_barrier(); // Reset - k1 = L_row_map(rowid); - k2 = L_row_map(rowid + 1); + k1 = static_cast(L_row_map(rowid)); + k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(L_entries(k)); - iw(my_league, col) = -1; +#ifndef NUMERIC_USE_FOR + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_team, col) = -1; }); #else - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2), - [&](const nnz_lno_t k) { + for (nnz_lno_t k = k1 + my_thread; k < k2 - 1; k += ts) { + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_team, col) = -1; + } +#endif +#else +#ifndef NUMERIC_USE_FOR + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { nnz_lno_t col = static_cast(L_entries(k)); - iw(my_league, col) = -1; + iw(my_team, col) = -1; }); +#else + for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_team, col) = -1; + } +#endif #endif - k1 = U_row_map(rowid); - k2 = U_row_map(rowid + 1); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2), - [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(U_entries(k)); - iw(my_league, col) = -1; + k1 = static_cast(U_row_map(rowid)); + k2 = static_cast(U_row_map(rowid + 1)); +#ifndef NUMERIC_USE_FOR + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(U_entries(k)); + iw(my_team, col) = -1; }); +#else + for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { + nnz_lno_t col = static_cast(U_entries(k)); + iw(my_team, col) = -1; + } +#endif } }; @@ -742,7 +783,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, // Main loop must be performed sequential. Question: Try out Cuda's graph // stuff to reduce kernel launch overhead - printf("work array iw %d x %d\n",iw.extent(0),iw.extent(1)); + printf("work array iw %d x %d, type %s\n",iw.extent(0),iw.extent(1),typeid(WorkViewType).name()); int tmpcnt = 0; int tmpnrows = 0; for (size_type lvl = 0; lvl < nlevels; ++lvl) { @@ -794,9 +835,10 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, policy_type(lvl_nrows_chunk, team_size), tstf); Kokkos::fence(); - lvl_rowid_start += lvl_nrows_chunk; tmpcnt++; tmpnrows += lvl_nrows_chunk; + + lvl_rowid_start += lvl_nrows_chunk; } } } // end if diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 411f91fb0b..817ee69626 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -204,7 +204,8 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, : (lnrows / lnchunks(i) + 1); if ((i < 10) || (i >= nlevels-10)) printf("Level %d, lnrows %d, nrows %d, required size %ld, avail_byte %ld, nchunks %d, rows per chunk %d\n",i,lnrows,nrows,required_size,avail_byte,lnchunks(i),lnrowsperchunk(i)); - if (lnrows == 312) + //if (lnrows == 312) + if (lnrows > 250) printf("Level %d, lnrows %d, nrows %d, required size %ld, avail_byte %ld, nchunks %d, rows per chunk %d\n",i,lnrows,nrows,required_size,avail_byte,lnchunks(i),lnrowsperchunk(i)); } else #endif From 15b3e00caa098ddc0f9f814c14cba6d06f58a8b5 Mon Sep 17 00:00:00 2001 From: Carl William Pearson Date: Tue, 31 May 2022 09:46:56 -0600 Subject: [PATCH 161/261] remove spurious runtime check that X is LayoutLeft --- src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index f73c09c712..77b76868f3 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -607,11 +607,6 @@ void spm_mv_block_impl_cusparse( // ldx and ldy should be the leading dimension of X,Y respectively const int ldx = static_cast(x.extent(0)); const int ldy = static_cast(y.extent(0)); - if (!std::is_same::value) { - std::cerr << "X,Y must be LayoutLeft cusparse[*]bsrmv.\n"; - throw std::invalid_argument("Invalid layout"); - } #if (9000 <= CUDA_VERSION) From d91c9b60539b24d6585f21c28862b5eaaf6487f0 Mon Sep 17 00:00:00 2001 From: Carl William Pearson Date: Tue, 31 May 2022 09:47:11 -0600 Subject: [PATCH 162/261] fix typo in comment --- src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index 705422ff33..57170d6eb6 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -195,7 +195,7 @@ struct spmv_mv_bsrmatrix_tpl_spec_avail { // These versions of cuSPARSE require the ordinal and offset types to be the // same. For KokkosKernels, this means int/int only. -// cuSapars level 3 does not currently support LayoutRight +// cuSparse level 3 does not currently support LayoutRight #define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \ SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ template <> \ From c03fda78ac38684c200baf7ee6c5d7841dbd7ac6 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 May 2022 11:37:49 -0600 Subject: [PATCH 163/261] perf_test/blas/blas3: Check for bhalf in __gemm_flop_count --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 2d87567c6f..d1855573e4 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -263,9 +263,11 @@ static std::string gemm_csv_header_str = // Flop count formula from lapack working note 41: // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf static inline double __gemm_flop_count(double a_m, double a_n, double b_n) { + // TODO: if not Kokkos::complex. if (std::is_same::value || std::is_same::value || - std::is_same::value) + std::is_same::value || + std::is_same::value) return 2 * a_m * b_n * a_n; else // For complex, we need to count 2 flops for each add and 6 flops for each From 5d596312dbb86ebb7eb658c3b24bab9ee6dfd0e5 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 May 2022 11:38:47 -0600 Subject: [PATCH 164/261] perf_test/blas/blas3: Use same branch name as code-examples --- perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh index fdd9558b14..4408db4f00 100755 --- a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh +++ b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh @@ -50,7 +50,7 @@ export KOKKOS_SRC_DIR=$(realpath $KOKKOS_SRC_DIR) export KOKKOS_SHA=${KOKKOS_SHA:-"tags/3.6.00"} export KOKKOSKERNELS_SRC_DIR=${KOKKOSKERNELS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos-kernels"} export KOKKOSKERNELS_SRC_DIR=$(realpath $KOKKOSKERNELS_SRC_DIR) -export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"half_examples"} +export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"half-precision"} envprint KOKKOS_SRC_DIR KOKKOS_SHA KOKKOSKERNELS_SRC_DIR KOKKOSKERNELS_SHA dry_run="off" From 6542cfb7841439c2061805f185aac45cea9826c4 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 May 2022 14:50:51 -0600 Subject: [PATCH 165/261] perf_test/blas/blas3: Use tag and add reproducer instructions --- .../KokkosBatched_BatchedGemm_benchmark.sh | 2 +- .../reproducer.md | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md diff --git a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh index 4408db4f00..d94197c046 100755 --- a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh +++ b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh @@ -50,7 +50,7 @@ export KOKKOS_SRC_DIR=$(realpath $KOKKOS_SRC_DIR) export KOKKOS_SHA=${KOKKOS_SHA:-"tags/3.6.00"} export KOKKOSKERNELS_SRC_DIR=${KOKKOSKERNELS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos-kernels"} export KOKKOSKERNELS_SRC_DIR=$(realpath $KOKKOSKERNELS_SRC_DIR) -export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"half-precision"} +export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"tags/us-rse-escience-2022"} envprint KOKKOS_SRC_DIR KOKKOS_SHA KOKKOSKERNELS_SRC_DIR KOKKOSKERNELS_SHA dry_run="off" diff --git a/perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md b/perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md new file mode 100644 index 0000000000..4d3bc72173 --- /dev/null +++ b/perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md @@ -0,0 +1,26 @@ +## To reproduce the half precision results for batched-GEMM: +```bash +git clone https://github.com/kokkos/kokkos.git +git clone https://github.com/kokkos/kokkos-kernels.git +cd kokkos-kernels +git checkout tags/us-rse-escience-2022 +cd perf_test/blas/blas3 +export KOKKOS_SRC_DIR=/path/to/kokkos +export KOKKOSKERNELS_SRC_DIR=/path/to/kokkos-kernels +``` + +### On V100 +```bash +./KokkosBatched_BatchedGemm_benchmark.sh double SNB VOLTA70 +./KokkosBatched_BatchedGemm_benchmark.sh float SNB VOLTA70 +./KokkosBatched_BatchedGemm_benchmark.sh half SNB VOLTA70 +./KokkosBatched_BatchedGemm_benchmark.sh bhalf SNB VOLTA70 +``` + +### On A100 +```bash +./KokkosBatched_BatchedGemm_benchmark.sh double DEFAULT AMPERE80 +./KokkosBatched_BatchedGemm_benchmark.sh float DEFAULT AMPERE80 +./KokkosBatched_BatchedGemm_benchmark.sh half DEFAULT AMPERE80 +./KokkosBatched_BatchedGemm_benchmark.sh bhalf DEFAULT AMPERE80 +``` \ No newline at end of file From 06a87f8a67825a286eca68bd8031cb57c5869d93 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 May 2022 14:59:08 -0600 Subject: [PATCH 166/261] perf_test/blas/blas3: Update tags --- perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh | 2 +- .../blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh index d94197c046..f2dd832125 100755 --- a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh +++ b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh @@ -50,7 +50,7 @@ export KOKKOS_SRC_DIR=$(realpath $KOKKOS_SRC_DIR) export KOKKOS_SHA=${KOKKOS_SHA:-"tags/3.6.00"} export KOKKOSKERNELS_SRC_DIR=${KOKKOSKERNELS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos-kernels"} export KOKKOSKERNELS_SRC_DIR=$(realpath $KOKKOSKERNELS_SRC_DIR) -export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"tags/us-rse-escience-2022"} +export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"tags/papers/us-rse-escience-2022"} envprint KOKKOS_SRC_DIR KOKKOS_SHA KOKKOSKERNELS_SRC_DIR KOKKOSKERNELS_SHA dry_run="off" diff --git a/perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md b/perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md index 4d3bc72173..e558abbff6 100644 --- a/perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md +++ b/perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md @@ -3,7 +3,7 @@ git clone https://github.com/kokkos/kokkos.git git clone https://github.com/kokkos/kokkos-kernels.git cd kokkos-kernels -git checkout tags/us-rse-escience-2022 +git checkout tags/papers/us-rse-escience-2022 cd perf_test/blas/blas3 export KOKKOS_SRC_DIR=/path/to/kokkos export KOKKOSKERNELS_SRC_DIR=/path/to/kokkos-kernels From bae78fbe26010371a6623769ab49f05d126bd9bf Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 31 May 2022 16:47:46 -0600 Subject: [PATCH 167/261] perf_test/blas/blas3: Increase benchmark batch size --- perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh index f2dd832125..3b382a474c 100755 --- a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh +++ b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh @@ -208,7 +208,7 @@ echo "cd $benchmark_dir" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh echo "$KOKKOSKERNELS_BUILD_DIR/perf_test/blas/blas3/KokkosBlas3_perf_test \ --test=batched_heuristic --routines=gemm --loop_type=parallel --batch_size_last_dim=0 \ --matrix_size_start=2x2,2x2,2x2 --matrix_size_stop=64x64,64x64,64x64 \ - --matrix_size_step=2 --batch_size=1024 \ + --matrix_size_step=2 --batch_size=$((32*1024)) \ --warm_up_loop=10 --iter=20 --verify=1 \ ${use_simd} \ --csv=${benchmark_dir}/${precision}_bench.csv" \ From 7bbbb43662cf0316620fb758f84f92c90c68540d Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 1 Jun 2022 18:00:43 -0600 Subject: [PATCH 168/261] ArithTraits: adding macros to reduce code and __float128 Adding a few macros that allow a more generic implementation of the various ArithTraits, this should make maintenance easier. Also refactoring the __float128 trait to use the Kokkos implementation and adding it to the generic unit-test. --- src/common/Kokkos_ArithTraits.hpp | 2656 +++++------------- unit_test/common/Test_Common_ArithTraits.hpp | 4 + 2 files changed, 705 insertions(+), 1955 deletions(-) diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp index cd681488dd..bb128d32c1 100644 --- a/src/common/Kokkos_ArithTraits.hpp +++ b/src/common/Kokkos_ArithTraits.hpp @@ -55,9 +55,7 @@ #include #include -#ifdef HAVE_KOKKOSKERNELS_QUADMATH -#include -#endif // HAVE_KOKKOSKERNELS_QUADMATH +#include #include #include @@ -227,6 +225,352 @@ KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, namespace Kokkos { namespace Details { +// Macro to automate the wrapping of Kokkos Mathematical Functions +// in the ArithTraits struct for real floating point types, hopefully +// this can be expanded to Kokkos::half_t and Kokkos::bhalf_t +#define KOKKOSKERNELS_ARITHTRAITS_REAL_FP(FUNC_QUAL) \ + static FUNC_QUAL val_type zero() { \ + return static_cast(0.0); \ + } \ + static FUNC_QUAL val_type one() { \ + return static_cast(1.0); \ + } \ + static FUNC_QUAL val_type min() { \ + return Kokkos::Experimental::finite_min::value; \ + } \ + static FUNC_QUAL val_type max() { \ + return Kokkos::Experimental::finite_max::value; \ + } \ + static FUNC_QUAL val_type infinity() { \ + return Kokkos::Experimental::infinity::value; \ + } \ + static FUNC_QUAL val_type nan() { \ + return Kokkos::Experimental::quiet_NaN::value; \ + } \ + static FUNC_QUAL mag_type epsilon() { \ + return Kokkos::Experimental::epsilon::value; \ + } \ + static FUNC_QUAL mag_type sfmin() { \ + return Kokkos::Experimental::norm_min::value; \ + } \ + static FUNC_QUAL int base() { \ + return Kokkos::Experimental::radix::value; \ + } \ + static FUNC_QUAL mag_type prec() { \ + return epsilon() * static_cast(base()); \ + } \ + static FUNC_QUAL int t() { \ + return Kokkos::Experimental::digits::value; \ + } \ + static FUNC_QUAL mag_type rnd() { return one(); } \ + static FUNC_QUAL int emin() { \ + return Kokkos::Experimental::min_exponent::value; \ + } \ + static FUNC_QUAL mag_type rmin() { \ + return Kokkos::Experimental::norm_min::value; \ + } \ + static FUNC_QUAL int emax() { \ + return Kokkos::Experimental::max_exponent::value; \ + } \ + static FUNC_QUAL mag_type rmax() { \ + return Kokkos::Experimental::finite_max< \ + val_type>::value; \ + } \ + \ + static FUNC_QUAL bool isInf(const val_type x) { \ + return Kokkos::isinf(x); \ + } \ + static FUNC_QUAL bool isNan(const val_type x) { \ + return Kokkos::isnan(x); \ + } \ + static FUNC_QUAL mag_type abs(const val_type x) { \ + return Kokkos::abs(x); \ + } \ + static FUNC_QUAL mag_type real(const val_type x) { \ + return x; \ + } \ + static FUNC_QUAL mag_type imag(const val_type) { \ + return zero(); \ + } \ + static FUNC_QUAL val_type conj(const val_type x) { \ + return x; \ + } \ + static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type sqrt(const val_type x) { \ + return Kokkos::sqrt(x); \ + } \ + static FUNC_QUAL val_type cbrt(const val_type x) { \ + return Kokkos::cbrt(x); \ + } \ + static FUNC_QUAL val_type exp(const val_type x) { \ + return Kokkos::exp(x); \ + } \ + static FUNC_QUAL val_type log(const val_type x) { \ + return Kokkos::log(x); \ + } \ + static FUNC_QUAL val_type log10(const val_type x) { \ + return Kokkos::log10(x); \ + } \ + static FUNC_QUAL val_type sin(const val_type x) { \ + return Kokkos::sin(x); \ + } \ + static FUNC_QUAL val_type cos(const val_type x) { \ + return Kokkos::cos(x); \ + } \ + static FUNC_QUAL val_type tan(const val_type x) { \ + return Kokkos::tan(x); \ + } \ + static FUNC_QUAL val_type sinh(const val_type x) { \ + return Kokkos::sinh(x); \ + } \ + static FUNC_QUAL val_type cosh(const val_type x) { \ + return Kokkos::cosh(x); \ + } \ + static FUNC_QUAL val_type tanh(const val_type x) { \ + return Kokkos::tanh(x); \ + } \ + static FUNC_QUAL val_type asin(const val_type x) { \ + return Kokkos::asin(x); \ + } \ + static FUNC_QUAL val_type acos(const val_type x) { \ + return Kokkos::acos(x); \ + } \ + static FUNC_QUAL val_type atan(const val_type x) { \ + return Kokkos::atan(x); \ + } \ + \ + static FUNC_QUAL bool isnaninf(const val_type x) { \ + return isNan(x) || isInf(x); \ + } \ + static FUNC_QUAL magnitudeType magnitude(const val_type x) { \ + return abs(x); \ + } \ + static FUNC_QUAL val_type conjugate(const val_type x) { \ + return conj(x); \ + } \ + static FUNC_QUAL val_type squareroot(const val_type x) { \ + return sqrt(x); \ + } \ + static FUNC_QUAL mag_type eps() { return epsilon(); } + +#define KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(FUNC_QUAL) \ + static FUNC_QUAL val_type zero() { \ + return val_type(ArithTraits::zero(), \ + ArithTraits::zero()); \ + } \ + static FUNC_QUAL val_type one() { \ + return val_type(ArithTraits::one(), \ + ArithTraits::zero()); \ + } \ + static FUNC_QUAL val_type min() { \ + return val_type(ArithTraits::min(), \ + ArithTraits::min()); \ + } \ + static FUNC_QUAL val_type max() { \ + return val_type(ArithTraits::max(), \ + ArithTraits::max()); \ + } \ + static FUNC_QUAL val_type infinity() { \ + return val_type(ArithTraits::infinity(), \ + ArithTraits::infinity()); \ + } \ + static FUNC_QUAL val_type nan() { \ + return val_type(ArithTraits::nan(), \ + ArithTraits::nan()); \ + } \ + static FUNC_QUAL mag_type epsilon() { \ + return ArithTraits::epsilon(); \ + } \ + static FUNC_QUAL mag_type sfmin() { \ + return ArithTraits::sfmin(); \ + } \ + static FUNC_QUAL int base() { \ + return ArithTraits::base(); \ + } \ + static FUNC_QUAL mag_type prec() { \ + return ArithTraits::prec(); \ + } \ + static FUNC_QUAL int t() { \ + return ArithTraits::t(); \ + } \ + static FUNC_QUAL mag_type rnd() { \ + return ArithTraits::rnd(); \ + } \ + static FUNC_QUAL int emin() { \ + return ArithTraits::emin(); \ + } \ + static FUNC_QUAL mag_type rmin() { \ + return ArithTraits::rmin(); \ + } \ + static FUNC_QUAL int emax() { \ + return ArithTraits::emax(); \ + } \ + static FUNC_QUAL mag_type rmax() { \ + return ArithTraits::rmax(); \ + } \ + static FUNC_QUAL bool isInf(const val_type x) { \ + return ArithTraits::isInf(x.real()) || \ + ArithTraits::isInf(x.imag()); \ + } \ + static FUNC_QUAL bool isNan(const val_type x) { \ + return ArithTraits::isNan(x.real()) || \ + ArithTraits::isNan(x.imag()); \ + } \ + static FUNC_QUAL mag_type abs(const val_type x) { \ + return ::Kokkos::abs(x); \ + } \ + static FUNC_QUAL mag_type real(const val_type x) { \ + return x.real(); \ + } \ + static FUNC_QUAL mag_type imag(const val_type x) { \ + return x.imag(); \ + } \ + static FUNC_QUAL val_type conj(const val_type x) { \ + return ::Kokkos::conj(x); \ + } \ + static FUNC_QUAL val_type pow (const val_type x, const \ + val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type pow (const val_type x, const \ + mag_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type pow (const mag_type x, const \ + val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type sqrt(const val_type x) { \ + return ::Kokkos::sqrt(x); \ + } \ + static FUNC_QUAL val_type exp (const val_type x) { \ + return Kokkos::exp(x); \ + } \ + static FUNC_QUAL val_type log (const val_type x) { \ + return Kokkos::log(x); \ + } \ + static FUNC_QUAL val_type log10 (const val_type x) { \ + return Kokkos::log10(x); \ + } \ + static FUNC_QUAL val_type sin (const val_type x) { \ + return Kokkos::sin(x); \ + } \ + static FUNC_QUAL val_type cos (const val_type x) { \ + return Kokkos::cos(x); \ + } \ + static FUNC_QUAL val_type tan (const val_type x) { \ + return Kokkos::tan(x); \ + } \ + static FUNC_QUAL val_type sinh (const val_type x) { \ + return Kokkos::sinh(x); \ + } \ + static FUNC_QUAL val_type cosh (const val_type x) { \ + return Kokkos::cosh(x); \ + } \ + static FUNC_QUAL val_type tanh (const val_type x) { \ + return Kokkos::tanh(x); \ + } \ + static FUNC_QUAL val_type asin (const val_type x) { \ + return Kokkos::asin(x); \ + } \ + static FUNC_QUAL val_type acos (const val_type x) { \ + return Kokkos::acos(x); \ + } \ + static FUNC_QUAL val_type atan (const val_type x) { \ + return Kokkos::atan(x); \ + } \ + static FUNC_QUAL bool isnaninf(const val_type& x) { \ + return isNan(x) || isInf(x); \ + } \ + static FUNC_QUAL mag_type magnitude(const val_type x) { \ + return abs(x); \ + } \ + static FUNC_QUAL val_type conjugate(const val_type x) { \ + return conj(x); \ + } \ + static FUNC_QUAL val_type squareroot (const val_type x) { \ + return sqrt (x); \ + } \ + static FUNC_QUAL mag_type eps() { return epsilon(); } + +#define KOKKOSKERNELS_SIGNED_ABS \ + static KOKKOS_FUNCTION mag_type abs(const val_type x) { \ + return Kokkos::abs(x); \ + } \ + +#define KOKKOSKERNELS_UNSIGNED_ABS \ + static KOKKOS_FUNCTION mag_type abs(const val_type x) { \ + return x; \ + } \ + +#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_ABS) \ + static KOKKOS_FUNCTION val_type zero() { \ + return static_cast(0); \ + } \ + static KOKKOS_FUNCTION val_type one() { \ + return static_cast(1); \ + } \ + static KOKKOS_FUNCTION val_type min() { \ + return Kokkos::Experimental::finite_min::value; \ + } \ + static KOKKOS_FUNCTION val_type max() { \ + return Kokkos::Experimental::finite_max::value; \ + } \ + static KOKKOS_FUNCTION val_type infinity() { \ + return static_cast(0); \ + } \ + static KOKKOS_FUNCTION bool isInf(const val_type) { \ + return false; \ + } \ + static KOKKOS_FUNCTION bool isNan(const val_type) { \ + return false; \ + } \ + KOKKOSKERNELS_ABS \ + static KOKKOS_FUNCTION mag_type real(const val_type x) { \ + return x; \ + } \ + static KOKKOS_FUNCTION mag_type imag(const val_type) { \ + return zero(); \ + } \ + static KOKKOS_FUNCTION val_type conj(const val_type x) { \ + return x; \ + } \ + static KOKKOS_FUNCTION val_type pow(const val_type x, \ + const val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static KOKKOS_FUNCTION val_type sqrt(const val_type x) { \ + return static_cast(Kokkos::sqrt(abs(x))); \ + } \ + static KOKKOS_FUNCTION val_type cbrt(const val_type x) { \ + return static_cast(Kokkos::cbrt(abs(x))); \ + } \ + static KOKKOS_FUNCTION val_type exp(const val_type x) { \ + return static_cast(Kokkos::exp(abs(x))); \ + } \ + static KOKKOS_FUNCTION val_type log(const val_type x) { \ + return static_cast(Kokkos::log(abs(x))); \ + } \ + static KOKKOS_FUNCTION val_type log10(const val_type x) { \ + return static_cast(Kokkos::log10(abs(x))); \ + } \ + static KOKKOS_FUNCTION mag_type epsilon() { return zero(); } \ + static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { \ + return abs(x); \ + } \ + static KOKKOS_FUNCTION val_type conjugate(const val_type x) { \ + return conj(x); \ + } \ + static KOKKOS_FUNCTION bool isnaninf(const val_type) { \ + return false; \ + } \ + static KOKKOS_FUNCTION val_type squareroot(const val_type x) { \ + return sqrt(x); \ + } + + /// \class ArithTraits /// \brief Traits class for arithmetic on type T. /// \tparam T "Scalar" type of interest @@ -383,7 +727,7 @@ class ArithTraits { /// Unfortunately we can't call this "isinf" (the equivalent C99 /// function), because CUDA appears to implement that function using /// a macro, rather than using a function (as C++11 requires). - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const T& x); + static KOKKOS_FUNCTION bool isInf(const T& x); /// \brief Whether x is NaN (not a number). /// @@ -394,16 +738,16 @@ class ArithTraits { /// Unfortunately we can't call this "isnan" (the equivalent C99 /// function), because CUDA appears to implement that function using /// a macro, rather than using a function (as C++11 requires). - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const T& x); + static KOKKOS_FUNCTION bool isNan(const T& x); //! The absolute value (magnitude) of x. - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const T& x); + static KOKKOS_FUNCTION mag_type abs(const T& x); //! The zero value of T; the arithmetic identity. - static KOKKOS_FORCEINLINE_FUNCTION T zero(); + static KOKKOS_FUNCTION T zero(); //! The one value of T; the multiplicative identity. - static KOKKOS_FORCEINLINE_FUNCTION T one(); + static KOKKOS_FUNCTION T one(); /// \brief True if this type T is capable of representing the /// positive infinity as a distinct special value, as with @@ -418,34 +762,34 @@ class ArithTraits { /// \note Would have liked to mark it as constexpr but then would /// not be able to provide the specialization for std::complex /// since its constructor only becomes constexpr with C++14. - static KOKKOS_FORCEINLINE_FUNCTION T infinity(); + static KOKKOS_FUNCTION T infinity(); /// \brief The minimum possible value of T. /// /// If T is a real floating-point type, then this is the minimum /// positive value, as with std::numeric_limits::min(). - static KOKKOS_FORCEINLINE_FUNCTION T min(); + static KOKKOS_FUNCTION T min(); //! The maximum possible value of T. - static KOKKOS_FORCEINLINE_FUNCTION T max(); + static KOKKOS_FUNCTION T max(); /// \brief The real part of x. /// /// If \c is_complex is false, then this just returns x. - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const T& x); + static KOKKOS_FUNCTION mag_type real(const T& x); /// \brief The imaginary part of x. /// /// If \c is_complex is false, then this just returns zero(). - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const T&); + static KOKKOS_FUNCTION mag_type imag(const T&); /// \brief The complex conjugate of x. /// /// If \c is_complex is false, then this just returns x. - static KOKKOS_FORCEINLINE_FUNCTION T conj(const T&); + static KOKKOS_FUNCTION T conj(const T&); //! x raised to the power y. - static KOKKOS_FORCEINLINE_FUNCTION T pow(const T& x, const T& y); + static KOKKOS_FUNCTION T pow(const T& x, const T& y); /// \brief The square root of x. /// @@ -458,7 +802,7 @@ class ArithTraits { /// exceptions in device functions.) Implementations should return /// NaN if the type T supports this. Of course, in that case, the /// square of the result will not equal x. - static KOKKOS_FORCEINLINE_FUNCTION T sqrt(const T& x); + static KOKKOS_FUNCTION T sqrt(const T& x); /// \brief The cubic root of x. /// @@ -471,7 +815,7 @@ class ArithTraits { /// exceptions in device functions.) Implementations should return /// NaN if the type T supports this. Of course, in that case, the /// cubic of the result will not equal x. - static KOKKOS_FORCEINLINE_FUNCTION T cbrt(const T& x); + static KOKKOS_FUNCTION T cbrt(const T& x); /// \brief The natural (base e) exponential function of x. /// @@ -479,7 +823,7 @@ class ArithTraits { /// function. If T is a complex-valued type, then this method /// returns \f$e^{x+iy} = e^x ( cos(y) + i sin(y) )\f$. /// - static KOKKOS_FORCEINLINE_FUNCTION T exp(const T& x); + static KOKKOS_FUNCTION T exp(const T& x); /// \brief The natural (base e) logarithm of x. /// @@ -492,7 +836,7 @@ class ArithTraits { /// throwing exceptions in device functions.) Implementations /// should return NaN if the type T supports this. Of course, in /// that case, if y is the result, \f$e^y\f$ will not equal x. - static KOKKOS_FORCEINLINE_FUNCTION T log(const T& x); + static KOKKOS_FUNCTION T log(const T& x); /// \brief The base ten logarithm of the input. /// @@ -505,7 +849,7 @@ class ArithTraits { /// throwing exceptions in device functions.) Implementations /// should return NaN if the type T supports this. Of course, in /// that case, if y is the result, \f$10^y\f$ will not equal x. - static KOKKOS_FORCEINLINE_FUNCTION T log10(const T& x); + static KOKKOS_FUNCTION T log10(const T& x); /// Trigonometric and hyperbolic functions are not available /// for integer types. This is because asin(sin(x)) is not x @@ -517,52 +861,52 @@ class ArithTraits { /// \brief The sin function of x /// - static KOKKOS_FORCEINLINE_FUNCTION T sin(const T& x); + static KOKKOS_FUNCTION T sin(const T& x); /// \brief The cos function of x /// - static KOKKOS_FORCEINLINE_FUNCTION T cos(const T& x); + static KOKKOS_FUNCTION T cos(const T& x); /// \brief The tan function of x /// - static KOKKOS_FORCEINLINE_FUNCTION T tan(const T& x); + static KOKKOS_FUNCTION T tan(const T& x); /// \brief The sin hyperbolic function of x /// - static KOKKOS_FORCEINLINE_FUNCTION T sinh(const T& x); + static KOKKOS_FUNCTION T sinh(const T& x); /// \brief The cos hyperbolic function of x /// - static KOKKOS_FORCEINLINE_FUNCTION T cosh(const T& x); + static KOKKOS_FUNCTION T cosh(const T& x); /// \brief The tan hyperbolic function of x /// - static KOKKOS_FORCEINLINE_FUNCTION T tanh(const T& x); + static KOKKOS_FUNCTION T tanh(const T& x); /// \brief The asin function of x /// - static KOKKOS_FORCEINLINE_FUNCTION T asin(const T& x); + static KOKKOS_FUNCTION T asin(const T& x); /// \brief The acos function of x /// - static KOKKOS_FORCEINLINE_FUNCTION T acos(const T& x); + static KOKKOS_FUNCTION T acos(const T& x); /// \brief The atan function of x /// - static KOKKOS_FORCEINLINE_FUNCTION T atan(const T& x); + static KOKKOS_FUNCTION T atan(const T& x); /// \brief Return a silent NaN, if appropriate for T. /// /// If T does not implement a silent NaN, the return value is /// undefined, but calling this method is still allowed. - static KOKKOS_FORCEINLINE_FUNCTION T nan(); + static KOKKOS_FUNCTION T nan(); /// \brief Machine epsilon. /// /// If T is an integer type (std::numeric_traits::is_exact is /// true), then epsilon() returns 0. Otherwise, if T is a /// floating-point type, it returns machine epsilon that T. - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon(); + static KOKKOS_FUNCTION mag_type epsilon(); //@{ /// \name Traits defined for backwards compatibility with @@ -602,45 +946,45 @@ class ArithTraits { static constexpr bool hasMachineParameters = false; //! Return relative machine precision. - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps(); + static KOKKOS_FUNCTION mag_type eps(); //! Return safe minimum (sfmin), such that 1/sfmin does not overflow. - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin(); + static KOKKOS_FUNCTION mag_type sfmin(); //! Return the base of the scalar type T. - static KOKKOS_FORCEINLINE_FUNCTION int base(); + static KOKKOS_FUNCTION int base(); //! Return eps*base. - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec(); + static KOKKOS_FUNCTION mag_type prec(); //! Returns the number of (base) digits in the significand. - static KOKKOS_FORCEINLINE_FUNCTION int t(); + static KOKKOS_FUNCTION int t(); //! 1.0 when rounding occurs in addition, else 0.0. - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd(); + static KOKKOS_FUNCTION mag_type rnd(); //! Returns the minimum exponent before (gradual) underflow. - static KOKKOS_FORCEINLINE_FUNCTION int emin(); + static KOKKOS_FUNCTION int emin(); //! Returns the underflow threshold: base^(emin-1) - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin(); + static KOKKOS_FUNCTION mag_type rmin(); //! Returns the largest exponent before overflow. - static KOKKOS_FORCEINLINE_FUNCTION int emax(); + static KOKKOS_FUNCTION int emax(); //! Overflow theshold: (base^emax)*(1-eps) - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax(); + static KOKKOS_FUNCTION mag_type rmax(); //! Same as abs(); return the magnitude of x. - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const T& x); + static KOKKOS_FUNCTION magnitudeType magnitude(const T& x); //! Same as conj(); return the complex conjugate of x. - static KOKKOS_FORCEINLINE_FUNCTION T conjugate(const T& x); + static KOKKOS_FUNCTION T conjugate(const T& x); /// \brief Whether x is (silent) NaN or Inf. /// /// This is the same as isNan(x) || isInf(x). - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const T& x); + static KOKKOS_FUNCTION bool isnaninf(const T& x); /// \brief The string name of T. /// @@ -648,7 +992,7 @@ class ArithTraits { static std::string name(); //! Same as sqrt(x); the square root of x. - static KOKKOS_FORCEINLINE_FUNCTION T squareroot(const T& x); + static KOKKOS_FUNCTION T squareroot(const T& x); //@} }; @@ -668,111 +1012,111 @@ class ArithTraits { static constexpr bool is_complex = false; static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + static KOKKOS_FUNCTION val_type infinity() { return Kokkos::Experimental::cast_to_half( Kokkos::Experimental::infinity::value); } - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { + static KOKKOS_FUNCTION bool isInf(const val_type x) { #ifndef __CUDA_ARCH__ using std::isinf; #endif return isinf(Kokkos::Experimental::cast_from_half(x)); } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { + static KOKKOS_FUNCTION bool isNan(const val_type x) { #ifndef __CUDA_ARCH__ using std::isnan; #endif return isnan(Kokkos::Experimental::cast_from_half(x)); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { + static KOKKOS_FUNCTION mag_type abs(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::abs(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + static KOKKOS_FUNCTION val_type zero() { return Kokkos::Experimental::cast_to_half(0.0); } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + static KOKKOS_FUNCTION val_type one() { return Kokkos::Experimental::cast_to_half(1.0); } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + static KOKKOS_FUNCTION val_type min() { return Kokkos::Experimental::cast_to_half(-KOKKOSKERNELS_IMPL_FP16_MAX); } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + static KOKKOS_FUNCTION val_type max() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { + static KOKKOS_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { + static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, + static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { return Kokkos::Experimental::cast_to_half( Kokkos::pow(Kokkos::Experimental::cast_from_half(x), Kokkos::Experimental::cast_from_half(y))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { + static KOKKOS_FUNCTION val_type sqrt(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::sqrt(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { + static KOKKOS_FUNCTION val_type cbrt(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::cbrt(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { + static KOKKOS_FUNCTION val_type exp(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::exp(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { + static KOKKOS_FUNCTION val_type log(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::log(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { + static KOKKOS_FUNCTION val_type log10(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::log10(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { + static KOKKOS_FUNCTION val_type sin(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::sin(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { + static KOKKOS_FUNCTION val_type cos(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::cos(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { + static KOKKOS_FUNCTION val_type tan(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::tan(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { + static KOKKOS_FUNCTION val_type sinh(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::sinh(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { + static KOKKOS_FUNCTION val_type cosh(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::cosh(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { + static KOKKOS_FUNCTION val_type tanh(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::tanh(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { + static KOKKOS_FUNCTION val_type asin(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::asin(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { + static KOKKOS_FUNCTION val_type acos(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::acos(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { + static KOKKOS_FUNCTION val_type atan(const val_type x) { return Kokkos::Experimental::cast_to_half( Kokkos::atan(Kokkos::Experimental::cast_from_half(x))); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { + static KOKKOS_FUNCTION mag_type epsilon() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_EPSILON); } // Backwards compatibility with Teuchos::ScalarTraits. @@ -785,51 +1129,51 @@ class ArithTraits { static constexpr bool isOrdinal = false; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = true; - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type x) { + static KOKKOS_FUNCTION bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { + static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { + static KOKKOS_FUNCTION val_type conjugate(const val_type x) { return conj(x); } static std::string name() { return "half"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { + static KOKKOS_FUNCTION val_type squareroot(const val_type x) { return sqrt(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { + static KOKKOS_FUNCTION val_type nan() { return Kokkos::Experimental::cast_to_half( Kokkos::Experimental::quiet_NaN::value); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { + static KOKKOS_FUNCTION mag_type eps() { return epsilon(); } + static KOKKOS_FUNCTION mag_type sfmin() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); } - static KOKKOS_FORCEINLINE_FUNCTION int base() { + static KOKKOS_FUNCTION int base() { return KOKKOSKERNELS_IMPL_FP16_RADIX; } // Use float to allow running on both host and device - static KOKKOS_FORCEINLINE_FUNCTION float prec() { + static KOKKOS_FUNCTION float prec() { float e = KOKKOSKERNELS_IMPL_FP16_EPSILON; float b = (float)base(); float r = e * b; return r; } - static KOKKOS_FORCEINLINE_FUNCTION int t() { + static KOKKOS_FUNCTION int t() { return KOKKOSKERNELS_IMPL_FP16_MANT_DIG; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return one(); } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { + static KOKKOS_FUNCTION mag_type rnd() { return one(); } + static KOKKOS_FUNCTION int emin() { return KOKKOSKERNELS_IMPL_FP16_MIN_EXP; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { + static KOKKOS_FUNCTION mag_type rmin() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { + static KOKKOS_FUNCTION int emax() { return KOKKOSKERNELS_IMPL_FP16_MAX_EXP; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { + static KOKKOS_FUNCTION mag_type rmax() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); } }; @@ -851,105 +1195,105 @@ class ArithTraits { static constexpr bool is_complex = false; static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { + static KOKKOS_FUNCTION val_type infinity() { return Kokkos::Experimental::cast_to_bhalf( Kokkos::Experimental::infinity::value); } - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { + static KOKKOS_FUNCTION bool isInf(const val_type x) { return Kokkos::isinf(Kokkos::Experimental::cast_from_bhalf(x)); } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { + static KOKKOS_FUNCTION bool isNan(const val_type x) { return Kokkos::isnan(Kokkos::Experimental::cast_from_bhalf(x)); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { + static KOKKOS_FUNCTION mag_type abs(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::abs(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { + static KOKKOS_FUNCTION val_type zero() { return Kokkos::Experimental::cast_to_bhalf(0.0F); } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { + static KOKKOS_FUNCTION val_type one() { return Kokkos::Experimental::cast_to_bhalf(1.0F); } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { + static KOKKOS_FUNCTION val_type min() { return Kokkos::Experimental::cast_to_bhalf(-KOKKOSKERNELS_IMPL_BF16_MAX); } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { + static KOKKOS_FUNCTION val_type max() { return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { + static KOKKOS_FUNCTION mag_type real(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { + static KOKKOS_FUNCTION mag_type imag(const val_type) { return Kokkos::Experimental::cast_to_bhalf(0.0F); } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { + static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, + static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::pow(Kokkos::Experimental::cast_from_bhalf(x), Kokkos::Experimental::cast_from_bhalf(y))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { + static KOKKOS_FUNCTION val_type sqrt(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::sqrt(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { + static KOKKOS_FUNCTION val_type cbrt(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::cbrt(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { + static KOKKOS_FUNCTION val_type exp(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::exp(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { + static KOKKOS_FUNCTION val_type log(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::log(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { + static KOKKOS_FUNCTION val_type log10(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::log10(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { + static KOKKOS_FUNCTION val_type sin(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::sin(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { + static KOKKOS_FUNCTION val_type cos(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::cos(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { + static KOKKOS_FUNCTION val_type tan(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::tan(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { + static KOKKOS_FUNCTION val_type sinh(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::sinh(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { + static KOKKOS_FUNCTION val_type cosh(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::cosh(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { + static KOKKOS_FUNCTION val_type tanh(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::tanh(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { + static KOKKOS_FUNCTION val_type asin(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::asin(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { + static KOKKOS_FUNCTION val_type acos(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::acos(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { + static KOKKOS_FUNCTION val_type atan(const val_type x) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::atan(Kokkos::Experimental::cast_from_bhalf(x))); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { + static KOKKOS_FUNCTION mag_type epsilon() { // return ::pow(2, -KOKKOSKERNELS_IMPL_BF16_SIGNIFICAND_BITS); return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_EPSILON); } @@ -963,51 +1307,51 @@ class ArithTraits { static constexpr bool isOrdinal = false; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = true; - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type x) { + static KOKKOS_FUNCTION bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { + static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { return abs(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { + static KOKKOS_FUNCTION val_type conjugate(const val_type x) { return conj(x); } static std::string name() { return "bhalf"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { + static KOKKOS_FUNCTION val_type squareroot(const val_type x) { return sqrt(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { + static KOKKOS_FUNCTION val_type nan() { return Kokkos::Experimental::cast_to_bhalf( Kokkos::Experimental::quiet_NaN::value); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { + static KOKKOS_FUNCTION mag_type eps() { return epsilon(); } + static KOKKOS_FUNCTION mag_type sfmin() { return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN); } - static KOKKOS_FORCEINLINE_FUNCTION int base() { + static KOKKOS_FUNCTION int base() { return KOKKOSKERNELS_IMPL_BF16_RADIX; } // Use float to allow running on both host and device - static KOKKOS_FORCEINLINE_FUNCTION float prec() { + static KOKKOS_FUNCTION float prec() { float e = KOKKOSKERNELS_IMPL_BF16_EPSILON; float b = (float)base(); float r = e * b; return r; } - static KOKKOS_FORCEINLINE_FUNCTION int t() { + static KOKKOS_FUNCTION int t() { return KOKKOSKERNELS_IMPL_BF16_MANT_DIG; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return one(); } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { + static KOKKOS_FUNCTION mag_type rnd() { return one(); } + static KOKKOS_FUNCTION int emin() { return KOKKOSKERNELS_IMPL_BF16_MIN_EXP; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { + static KOKKOS_FUNCTION mag_type rmin() { return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN); } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { + static KOKKOS_FUNCTION int emax() { return KOKKOSKERNELS_IMPL_BF16_MAX_EXP; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { + static KOKKOS_FUNCTION mag_type rmax() { return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); } }; @@ -1038,133 +1382,7 @@ class ArithTraits { static std::string name() { return "float"; } - static val_type zero() { - return static_cast(0.0); - } - static val_type one() { - return static_cast(1.0); - } - static val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static float infinity() { - return Kokkos::Experimental::infinity::value; - } - static val_type nan() { - return Kokkos::Experimental::quiet_NaN::value; - } - static mag_type epsilon() { - return Kokkos::Experimental::epsilon::value; - } - static mag_type sfmin() { - return Kokkos::Experimental::norm_min::value; - } - static int base() { - return Kokkos::Experimental::radix::value; - } - static mag_type prec() { - return eps() * static_cast(base()); - } - static int t() { - return Kokkos::Experimental::digits::value; - } - static mag_type rnd() { return one(); } - static int emin() { - return Kokkos::Experimental::min_exponent::value; - } - static mag_type rmin() { - return Kokkos::Experimental::norm_min::value; - } - static int emax() { - return Kokkos::Experimental::max_exponent::value; - } - static mag_type rmax() { - return Kokkos::Experimental::finite_max< - val_type>::value; - } - - // Math Functions - static bool isInf(const val_type x) { - return Kokkos::isinf(x); - } - static bool isNan(const val_type x) { - return Kokkos::isnan(x); - } - static mag_type abs(const val_type x) { - return Kokkos::abs(x); - } - static mag_type real(const val_type x) { - return x; - } - static mag_type imag(const val_type) { - return zero(); - } - static val_type conj(const val_type x) { - return x; - } - static val_type pow(const val_type x, const val_type y) { - return Kokkos::pow(x, y); - } - static val_type sqrt(const val_type x) { - return Kokkos::sqrt(x); - } - static val_type cbrt(const val_type x) { - return Kokkos::cbrt(x); - } - static val_type exp(const val_type x) { - return Kokkos::exp(x); - } - static val_type log(const val_type x) { - return Kokkos::log(x); - } - static val_type log10(const val_type x) { - return Kokkos::log10(x); - } - static val_type sin(const val_type x) { - return Kokkos::sin(x); - } - static val_type cos(const val_type x) { - return Kokkos::cos(x); - } - static val_type tan(const val_type x) { - return Kokkos::tan(x); - } - static val_type sinh(const val_type x) { - return Kokkos::sinh(x); - } - static val_type cosh(const val_type x) { - return Kokkos::cosh(x); - } - static val_type tanh(const val_type x) { - return Kokkos::tanh(x); - } - static val_type asin(const val_type x) { - return Kokkos::asin(x); - } - static val_type acos(const val_type x) { - return Kokkos::acos(x); - } - static val_type atan(const val_type x) { - return Kokkos::atan(x); - } - - // Aliases - static bool isnaninf(const val_type x) { - return isNan(x) || isInf(x); - } - static magnitudeType magnitude(const val_type x) { - return abs(x); - } - static val_type conjugate(const val_type x) { - return conj(x); - } - static val_type squareroot(const val_type x) { - return sqrt(x); - } - static mag_type eps() { return epsilon(); } + KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION) }; template <> @@ -1199,133 +1417,7 @@ class ArithTraits { static std::string name() { return "double"; } - static val_type zero() { - return static_cast(0.0); - } - static val_type one() { - return static_cast(1.0); - } - static val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static double infinity() { - return Kokkos::Experimental::infinity::value; - } - static val_type nan() { - return Kokkos::Experimental::quiet_NaN::value; - } - static mag_type epsilon() { - return Kokkos::Experimental::epsilon::value; - } - static mag_type sfmin() { - return Kokkos::Experimental::norm_min::value; - } - static int base() { - return Kokkos::Experimental::radix::value; - } - static mag_type prec() { - return eps() * static_cast(base()); - } - static int t() { - return Kokkos::Experimental::digits::value; - } - static mag_type rnd() { return one(); } - static int emin() { - return Kokkos::Experimental::min_exponent::value; - } - static mag_type rmin() { - return Kokkos::Experimental::norm_min::value; - } - static int emax() { - return Kokkos::Experimental::max_exponent::value; - } - static mag_type rmax() { - return Kokkos::Experimental::finite_max< - val_type>::value; - } - - // Math Functions - static bool isInf(const val_type x) { - return Kokkos::isinf(x); - } - static bool isNan(const val_type x) { - return Kokkos::isnan(x); - } - static mag_type abs(const val_type x) { - return Kokkos::abs(x); - } - static mag_type real(const val_type x) { - return x; - } - static mag_type imag(const val_type) { - return zero(); - } - static val_type conj(const val_type x) { - return x; - } - static val_type pow(const val_type x, const val_type y) { - return Kokkos::pow(x, y); - } - static val_type sqrt(const val_type x) { - return Kokkos::sqrt(x); - } - static val_type cbrt(const val_type x) { - return Kokkos::cbrt(x); - } - static val_type exp(const val_type x) { - return Kokkos::exp(x); - } - static val_type log(const val_type x) { - return Kokkos::log(x); - } - static val_type log10(const val_type x) { - return Kokkos::log10(x); - } - static val_type sin(const val_type x) { - return Kokkos::sin(x); - } - static val_type cos(const val_type x) { - return Kokkos::cos(x); - } - static val_type tan(const val_type x) { - return Kokkos::tan(x); - } - static val_type sinh(const val_type x) { - return Kokkos::sinh(x); - } - static val_type cosh(const val_type x) { - return Kokkos::cosh(x); - } - static val_type tanh(const val_type x) { - return Kokkos::tanh(x); - } - static val_type asin(const val_type x) { - return Kokkos::asin(x); - } - static val_type acos(const val_type x) { - return Kokkos::acos(x); - } - static val_type atan(const val_type x) { - return Kokkos::atan(x); - } - - // Aliases - static bool isnaninf(const val_type& x) { - return isNan(x) || isInf(x); - } - static mag_type magnitude(const val_type x) { - return abs(x); - } - static val_type conjugate(const val_type x) { - return conj(x); - } - static val_type squareroot(const val_type x) { - return sqrt(x); - } - static mag_type eps() { return epsilon(); } + KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION) }; // CUDA and HIP do not support long double in device functions, @@ -1358,160 +1450,70 @@ class ArithTraits { static std::string name() { return "long double"; } - static val_type zero() { return static_cast(0.0); } - static val_type one() { return static_cast(1.0); } - static val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static long double infinity() { - return Kokkos::Experimental::infinity::value; - } - static val_type nan() { - return Kokkos::Experimental::quiet_NaN::value; - } - static mag_type epsilon() { - return Kokkos::Experimental::epsilon::value; - } - static mag_type sfmin() { - return Kokkos::Experimental::norm_min::value; - } - static int base() { return Kokkos::Experimental::radix::value; } - static mag_type prec() { return eps() * static_cast(base()); } - static int t() { return Kokkos::Experimental::digits::value; } - static mag_type rnd() { return one(); } - static int emin() { - return Kokkos::Experimental::min_exponent::value; - } - static mag_type rmin() { - return Kokkos::Experimental::norm_min::value; - } - static int emax() { - return Kokkos::Experimental::max_exponent::value; - } - static mag_type rmax() { - return Kokkos::Experimental::finite_max::value; - } - - // Math Functions - static bool isInf(const val_type& x) { return Kokkos::isinf(x); } - static bool isNan(const val_type& x) { return Kokkos::isnan(x); } - static mag_type abs(const val_type& x) { return Kokkos::abs(x); } - static mag_type real(const val_type& x) { return x; } - static mag_type imag(const val_type&) { return zero(); } - static val_type conj(const val_type& x) { return x; } - static val_type pow(const val_type& x, const val_type& y) { - return Kokkos::pow(x, y); - } - static val_type sqrt(const val_type& x) { return Kokkos::sqrt(x); } - static val_type cbrt(const val_type& x) { return Kokkos::cbrtl(x); } - static val_type exp(const val_type& x) { return Kokkos::exp(x); } - static val_type log(const val_type& x) { return Kokkos::log(x); } - static val_type log10(const val_type& x) { return Kokkos::log10(x); } - static val_type sin(const val_type& x) { return Kokkos::sin(x); } - static val_type cos(const val_type& x) { return Kokkos::cos(x); } - static val_type tan(const val_type& x) { return Kokkos::tan(x); } - static val_type sinh(const val_type& x) { return Kokkos::sinh(x); } - static val_type cosh(const val_type& x) { return Kokkos::cosh(x); } - static val_type tanh(const val_type& x) { return Kokkos::tanh(x); } - static val_type asin(const val_type& x) { return Kokkos::asin(x); } - static val_type acos(const val_type& x) { return Kokkos::acos(x); } - static val_type atan(const val_type& x) { return Kokkos::atan(x); } - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static mag_type magnitude(const val_type& x) { return abs(x); } - static val_type conjugate(const val_type& x) { return conj(x); } - static val_type squareroot(const val_type& x) { return sqrt(x); } - static mag_type eps() { return epsilon(); } + KOKKOSKERNELS_ARITHTRAITS_REAL_FP( ) }; // long double specialization +template <> +class ArithTraits< ::Kokkos::complex > { + public: + using val_type = ::Kokkos::complex; + using mag_type = float; + + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = true; + static constexpr bool has_infinity = true; + + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + using halfPrecision = ::Kokkos::complex::halfPrecision>; + using doublePrecision = + ::Kokkos::complex::doublePrecision>; + + static constexpr bool isComplex = true; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = false; + static constexpr bool hasMachineParameters = + ArithTraits::hasMachineParameters; + + static std::string name() { return "Kokkos::complex"; } + + KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(KOKKOS_FUNCTION) +}; -#ifdef HAVE_KOKKOSKERNELS_QUADMATH -// CUDA does not support __float128 in device functions, so none of -// the class methods in this specialization are marked as device -// functions. template <> -class ArithTraits<__float128> { +class ArithTraits< ::Kokkos::complex > { public: - using val_type = __float128; - using mag_type = val_type; + using val_type = ::Kokkos::complex; + using mag_type = double; static constexpr bool is_specialized = true; static constexpr bool is_signed = true; static constexpr bool is_integer = false; static constexpr bool is_exact = false; - static constexpr bool is_complex = false; + static constexpr bool is_complex = true; + static constexpr bool has_infinity = true; // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; - using halfPrecision = double; - // Unfortunately, we can't rely on a standard __float256 type. - using doublePrecision = __float128; + using halfPrecision = ::Kokkos::complex::halfPrecision>; + using doublePrecision = + ::Kokkos::complex::doublePrecision>; - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = false; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = true; + static constexpr bool isComplex = true; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = false; + static constexpr bool hasMachineParameters = + ArithTraits::hasMachineParameters; - static __float128 zero() { return 0.0; } - static __float128 one() { return 1.0; } - static __float128 min() { return FLT128_MIN; } - static __float128 max() { return FLT128_MAX; } - static __float128 infinity() { return 1.0q / 0.0q; } - static __float128 nan() { return strtoflt128("NAN()", NULL); } - static mag_type epsilon() { return FLT128_EPSILON; } - static mag_type sfmin() { - return FLT128_MIN; // ??? - } - static int base() { return 2; } - static mag_type prec() { return eps() * static_cast(base()); } - static int t() { return FLT_MANT_DIG; } - static mag_type rnd() { return 1.0; } - static int emin() { return FLT128_MIN_EXP; } - static mag_type rmin() { - return FLT128_MIN; // ??? // should be base^(emin-1) - } - static int emax() { return FLT128_MAX_EXP; } - static mag_type rmax() { - return FLT128_MAX; // ??? // should be (base^emax)*(1-eps) - } + static std::string name() { return "Kokkos::complex"; } - // Math Functions - static bool isInf(const __float128 x) { return isinfq(x); } - static bool isNan(const __float128 x) { return isnanq(x); } - static mag_type abs(const __float128 x) { return fabsq(x); } - static mag_type real(const __float128 x) { return x; } - static mag_type imag(const __float128 /* x */) { return 0.0; } - static __float128 conj(const __float128 x) { return x; } - static __float128 pow(const __float128 x, const __float128 y) { - return powq(x, y); - } - static __float128 sqrt(const __float128 x) { return sqrtq(x); } - static __float128 cbrt(const __float128 x) { return cbrtq(x); } - static __float128 exp(const __float128 x) { return exp(x); } - static __float128 log(const __float128 x) { return logq(x); } - static __float128 log10(const __float128 x) { return log10q(x); } - static __float128 sin(const __float128 x) { return sinq(x); } - static __float128 cos(const __float128 x) { return cosq(x); } - static __float128 tan(const __float128 x) { return tanq(x); } - static __float128 sinh(const __float128 x) { return sinhq(x); } - static __float128 cosh(const __float128 x) { return coshq(x); } - static __float128 tanh(const __float128 x) { return tanhq(x); } - static __float128 asin(const __float128 x) { return asinq(x); } - static __float128 acos(const __float128 x) { return acosq(x); } - static __float128 atan(const __float128 x) { return atanq(x); } + KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(KOKKOS_FUNCTION) +}; - //Aliases - static bool isnaninf(const __float128 x) { return isNan(x) || isInf(x); } - static magnitudeType magnitude(const __float128 x) { return abs(x); } - static __float128 conjugate(const __float128 x) { return conj(x); } - static std::string name() { return "__float128"; } - static __float128 squareroot(const __float128 x) { return sqrt(x); } - static mag_type eps() { return epsilon(); } -}; // __float128 specialization -#endif // HAVE_KOKKOSKERNELS_QUADMATH /// \brief Partial specialization for std::complex. /// @@ -1753,351 +1755,146 @@ class ArithTraits > { static mag_type rmax() { return ArithTraits::rmax(); } }; +#if defined(KOKKOS_ENABLE_LIBQUADMATH) +// CUDA does not support __float128 in device functions, so none of +// the class methods in this specialization are marked as device +// functions. template <> -class ArithTraits< ::Kokkos::complex > { +class ArithTraits<__float128> { public: - using val_type = ::Kokkos::complex; - using mag_type = float; + using val_type = __float128; + using mag_type = val_type; static constexpr bool is_specialized = true; static constexpr bool is_signed = true; static constexpr bool is_integer = false; static constexpr bool is_exact = false; - static constexpr bool is_complex = true; + static constexpr bool is_complex = false; static constexpr bool has_infinity = true; // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; - using halfPrecision = ::Kokkos::complex::halfPrecision>; - using doublePrecision = - ::Kokkos::complex::doublePrecision>; - - static constexpr bool isComplex = true; - static constexpr bool isOrdinal = false; - static constexpr bool isComparable = false; - static constexpr bool hasMachineParameters = - ArithTraits::hasMachineParameters; + using halfPrecision = double; + // Unfortunately, we can't rely on a standard __float256 type. + using doublePrecision = __float128; - static std::string name() { return "Kokkos::complex"; } + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; - static val_type zero() { - return val_type(ArithTraits::zero(), - ArithTraits::zero()); - } - static val_type one() { - return val_type(ArithTraits::one(), - ArithTraits::zero()); - } - static val_type min() { - return val_type(ArithTraits::min(), - ArithTraits::min()); - } - static val_type max() { - return val_type(ArithTraits::max(), - ArithTraits::max()); + static val_type zero() { return static_cast(0.0); } + static val_type one() { return static_cast(1.0); } + static val_type min() { + return Kokkos::Experimental::finite_min::value; } - static val_type infinity() { - return val_type(ArithTraits::infinity(), - ArithTraits::infinity()); + static val_type max() { + return Kokkos::Experimental::finite_max::value; } - static val_type nan() { - return val_type(ArithTraits::nan(), ArithTraits::nan()); + static val_type infinity() { + return Kokkos::Experimental::infinity::value; } - static mag_type epsilon() { - return ArithTraits::epsilon(); + static val_type nan() { + return Kokkos::Experimental::nanq(""); } - static mag_type sfmin() { - return ArithTraits::sfmin(); + static mag_type epsilon() { + return Kokkos::Experimental::epsilon::value; } - static int base() { - return ArithTraits::base(); + static mag_type sfmin() { + return Kokkos::Experimental::norm_min::value; } - static mag_type prec() { - return ArithTraits::prec(); + static int base() { + return Kokkos::Experimental::radix::value; } - static int t() { - return ArithTraits::t(); + static mag_type prec() { + return epsilon() * static_cast(base()); } - static mag_type rnd() { - return ArithTraits::rnd(); + static int t() { + return Kokkos::Experimental::digits::value; } - static int emin() { - return ArithTraits::emin(); + static mag_type rnd() { return static_cast(1.0); } + static int emin() { + return Kokkos::Experimental::min_exponent::value; } - static mag_type rmin() { - return ArithTraits::rmin(); + static mag_type rmin() { + return Kokkos::Experimental::norm_min::value; } - static int emax() { - return ArithTraits::emax(); + static int emax() { + return Kokkos::Experimental::max_exponent::value; } - static mag_type rmax() { - return ArithTraits::rmax(); + static mag_type rmax() { + return Kokkos::Experimental::finite_max::value; + // return Kokkos::Experimental::norm_max::value; } // Math Functions - static bool isInf(const val_type x) { - return ArithTraits::isInf(x.real()) || - ArithTraits::isInf(x.imag()); - } - static bool isNan(const val_type x) { - return ArithTraits::isNan(x.real()) || - ArithTraits::isNan(x.imag()); - } - static mag_type abs(const val_type x) { - return Kokkos::abs(x); - } - static mag_type real(const val_type x) { - return x.real(); - } - static mag_type imag(const val_type x) { - return x.imag(); - } - static val_type conj(const val_type x) { - return ::Kokkos::conj(x); - } - static val_type pow (const val_type x, const - val_type y) { - return Kokkos::pow(x, y); - } - static val_type pow (const val_type x, const - mag_type y) { - return Kokkos::pow(x, y); - } - static val_type pow (const mag_type x, const - val_type y) { - return Kokkos::pow(x, y); - } - static val_type sqrt(const val_type x) { - return ::Kokkos::sqrt(x); - } - // static val_type cbrt (const val_type x) { - // const mag_type r = ::Kokkos::abs(x); - // const mag_type phi = ::atan(x.imag()/x.real())/mag_type(3); - // const mag_type re = r* ::cos(phi); - // const mag_type im = r* ::sin(phi); - // return val_type(re,im); - // } - static val_type exp (const val_type x) { - return Kokkos::exp(x); - } - static val_type log (const val_type x) { - return Kokkos::log(x); - } - static val_type log10 (const val_type x) { - return Kokkos::log10(x); - } - static val_type sin (const val_type x) { - return Kokkos::sin(x); - } - static val_type cos (const val_type x) { - return Kokkos::cos(x); - } - static val_type tan (const val_type x) { - return Kokkos::tan(x); - } - static val_type sinh (const val_type x) { - return Kokkos::cosh(x); - } - static val_type cosh (const val_type x) { - return Kokkos::cosh(x); - } - static val_type tanh (const val_type x) { - return Kokkos::tanh(x); - } - static val_type asin (const val_type x) { - return Kokkos::asin(x); - } - static val_type acos (const val_type x) { - return Kokkos::acos(x); - } - static val_type atan (const val_type x) { - return Kokkos::atan(x); - } - - // Aliases - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static mag_type magnitude(const val_type x) { - return abs(x); - } - static val_type conjugate(const val_type x) { - return conj(x); - } - static val_type squareroot (const val_type x) { - return sqrt (x); - } - static mag_type eps() { return epsilon(); } -}; - -template <> -class ArithTraits< ::Kokkos::complex > { - public: - using val_type = ::Kokkos::complex; - using mag_type = double; - - static constexpr bool is_specialized = true; - static constexpr bool is_signed = true; - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr bool is_complex = true; - - static constexpr bool has_infinity = true; - static val_type infinity() { - return val_type(ArithTraits::infinity(), - ArithTraits::infinity()); - } - - static bool isInf(const val_type x) { - return ArithTraits::isInf(x.real()) || - ArithTraits::isInf(x.imag()); - } - static bool isNan(const val_type x) { - return ArithTraits::isNan(x.real()) || - ArithTraits::isNan(x.imag()); - } - static mag_type abs(const val_type x) { - return ::Kokkos::abs(x); + static bool isInf(const val_type x) { + return Kokkos::Experimental::isinf(x); } - static val_type zero() { - return val_type(ArithTraits::zero(), - ArithTraits::zero()); + static bool isNan(const val_type x) { + return Kokkos::Experimental::isnan(x); } - static val_type one() { - return val_type(ArithTraits::one(), - ArithTraits::zero()); + static mag_type abs(const val_type x) { + return Kokkos::Experimental::fabs(x); } - static val_type min() { - return val_type(ArithTraits::min(), - ArithTraits::min()); - } - static val_type max() { - return val_type(ArithTraits::max(), - ArithTraits::max()); - } - static mag_type real(const val_type x) { - return x.real(); - } - static mag_type imag(const val_type x) { - return x.imag(); - } - static val_type conj(const val_type x) { - return ::Kokkos::conj(x); - } - static val_type pow (const val_type x, const - val_type y) { - return Kokkos::pow(x, y); - } - static val_type pow (const val_type x, const - mag_type y) { - return Kokkos::pow(x, y); - } - static val_type pow (const mag_type x, const - val_type y) { - return Kokkos::pow(x, y); - } - static val_type sqrt(const val_type x) { - return ::Kokkos::sqrt(x); - } - // static val_type cbrt (const val_type x) { - // const mag_type r = ::Kokkos::abs(x); - // const mag_type phi = ::atan(x.imag()/x.real())/mag_type(3); - // const mag_type re = r* ::cos(phi); - // const mag_type im = r* ::sin(phi); - // return val_type(re,im); + static mag_type real(const val_type x) { return x; } + static mag_type imag(const val_type /* x */) { return zero(); } + static val_type conj(const val_type x) { return x; } + // static val_type pow(const val_type x, const val_type y) { + // return Kokkos::Experimental::pow(x, y); // } - static val_type exp (const val_type x) { - return Kokkos::exp(x); + static val_type sqrt(const val_type x) { + return Kokkos::Experimental::sqrt(x); } - static val_type log (const val_type x) { - return Kokkos::log(x); + static val_type cbrt(const val_type x) { + return Kokkos::Experimental::cbrt(x); } - static val_type log10 (const val_type x) { - return Kokkos::log10(x); + static val_type exp(const val_type x) { + return Kokkos::Experimental::exp(x); } - static val_type sin (const val_type x) { - return Kokkos::sin(x); + static val_type log(const val_type x) { + return Kokkos::Experimental::log(x); } - static val_type cos (const val_type x) { - return Kokkos::cos(x); + static val_type log10(const val_type x) { + return Kokkos::Experimental::log10(x); } - static val_type tan (const val_type x) { - return Kokkos::tan(x); + static val_type sin(const val_type x) { + return Kokkos::Experimental::sin(x); } - static val_type sinh (const val_type x) { - return Kokkos::sinh(x); + static val_type cos(const val_type x) { + return Kokkos::Experimental::cos(x); } - static val_type cosh (const val_type x) { - return Kokkos::cosh(x); + static val_type tan(const val_type x) { + return Kokkos::Experimental::tan(x); } - static val_type tanh (const val_type x) { - return Kokkos::tanh(x); + static val_type sinh(const val_type x) { + return Kokkos::Experimental::sinh(x); } - static val_type asin (const val_type x) { - return Kokkos::asin(x); + static val_type cosh(const val_type x) { + return Kokkos::Experimental::cosh(x); } - static val_type acos (const val_type x) { - return Kokkos::acos(x); + static val_type tanh(const val_type x) { + return Kokkos::Experimental::tanh(x); } - static val_type atan (const val_type x) { - return Kokkos::atan(x); + static val_type asin(const val_type x) { + return Kokkos::Experimental::asin(x); } - static val_type nan() { - return val_type(ArithTraits::nan(), ArithTraits::nan()); + static val_type acos(const val_type x) { + return Kokkos::Experimental::acos(x); } - static mag_type epsilon() { - return ArithTraits::epsilon(); + static val_type atan(const val_type x) { + return Kokkos::Experimental::atan(x); } - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = ::Kokkos::complex::halfPrecision>; - using doublePrecision = - ::Kokkos::complex::doublePrecision>; - - static constexpr bool isComplex = true; - static constexpr bool isOrdinal = false; - static constexpr bool isComparable = false; - static constexpr bool hasMachineParameters = - ArithTraits::hasMachineParameters; - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static mag_type magnitude(const val_type x) { - return abs(x); - } - static val_type conjugate(const val_type x) { - return conj(x); - } - static std::string name() { return "Kokkos::complex"; } - static val_type squareroot (const val_type x) { - return sqrt (x); - } - static mag_type eps() { return epsilon(); } - static mag_type sfmin() { - return ArithTraits::sfmin(); - } - static int base() { - return ArithTraits::base(); - } - static mag_type prec() { - return ArithTraits::prec(); - } - static int t() { - return ArithTraits::t(); - } - static mag_type rnd() { - return ArithTraits::rnd(); - } - static int emin() { - return ArithTraits::emin(); - } - static mag_type rmin() { - return ArithTraits::rmin(); - } - static int emax() { - return ArithTraits::emax(); - } - static mag_type rmax() { - return ArithTraits::rmax(); - } -}; + //Aliases + static bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } + static magnitudeType magnitude(const val_type x) { return abs(x); } + static val_type conjugate(const val_type x) { return conj(x); } + static std::string name() { return "__float128"; } + static val_type squareroot(const val_type x) { return sqrt(x); } + static mag_type eps() { return epsilon(); } +}; // __float128 specialization +#endif // KOKKOS_ENABLE_LIBQUADMATH template <> class ArithTraits { @@ -2116,106 +1913,6 @@ class ArithTraits { static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return static_cast(0); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - // This avoids warnings based on whether char is signed or unsigned - return Kokkos::abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return static_cast(0); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return static_cast(1); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return zero(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // C++11 defines std::sqrt for integer arguments. However, we - // currently can't assume C++11. - // - // This cast will result in no loss of accuracy, though it might - // be more expensive than it should, if we were clever about using - // bit operations. - // - // We take the absolute value first to avoid negative arguments. - // Negative real arguments to sqrt(float) return (float) NaN, but - // built-in integer types do not have an equivalent to NaN. - // Casting NaN to an integer type will thus result in some integer - // value which appears valid, but is not. We cannot raise an - // exception in device functions. Thus, we prefer to take the - // absolute value of x first, to avoid issues. Another - // possibility would be to test for a NaN output and convert it to - // some reasonable value (like 0), though this might be more - // expensive than the absolute value interpreted using the ternary - // operator. - return static_cast(Kokkos::sqrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast(Kokkos::cbrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(Kokkos::exp(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(Kokkos::log(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(Kokkos::log10(abs(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; @@ -2226,19 +1923,10 @@ class ArithTraits { static constexpr bool isOrdinal = true; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } + static std::string name() { return "char"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS) }; template <> @@ -2254,87 +1942,6 @@ class ArithTraits { static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return static_cast(0); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return Kokkos::abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return static_cast(0); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return static_cast(1); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return zero(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return static_cast(Kokkos::sqrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast(Kokkos::cbrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(Kokkos::exp(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(Kokkos::log(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(Kokkos::log10(abs(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; @@ -2345,19 +1952,10 @@ class ArithTraits { static constexpr bool isOrdinal = true; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } + static std::string name() { return "signed char"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS) }; template <> @@ -2373,87 +1971,6 @@ class ArithTraits { static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return static_cast(0); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; // it's unsigned, so it's positive - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return static_cast(0); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return static_cast(1); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return zero(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return static_cast(Kokkos::sqrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast(Kokkos::cbrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(Kokkos::exp(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(Kokkos::log(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(Kokkos::log10(x)); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; @@ -2464,19 +1981,10 @@ class ArithTraits { static constexpr bool isOrdinal = true; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } + static std::string name() { return "unsigned char"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS) }; template <> @@ -2492,94 +2000,6 @@ class ArithTraits { static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return static_cast(0); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return Kokkos::abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return static_cast(0); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return static_cast(1); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return zero(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::pow(x, y); - } - //! Integer square root returns a lower bound. - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return static_cast(Kokkos::sqrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast(Kokkos::cbrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(Kokkos::exp(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(Kokkos::log(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(Kokkos::log10(abs(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // short doesn't implement a NaN value, but we can still have it - // return some "flag" value that can help users find use of - // uninitialized data. - return -one(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; @@ -2590,19 +2010,10 @@ class ArithTraits { static constexpr bool isOrdinal = true; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } + static std::string name() { return "short"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS) }; template <> @@ -2614,225 +2025,10 @@ class ArithTraits { static constexpr bool is_specialized = true; static constexpr bool is_signed = false; static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return static_cast(0); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; // it's unsigned, so it's positive - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return static_cast(0); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return static_cast(1); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return zero(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. - return static_cast(Kokkos::sqrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast(Kokkos::cbrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(Kokkos::exp(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(Kokkos::log(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(Kokkos::log10(x)); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // unsigned short doesn't implement a NaN value, but we can still - // have it return some "flag" value that can help users find use - // of uninitialized data. - return max(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = val_type; - using doublePrecision = val_type; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = true; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "unsigned short"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - using val_type = int; - using mag_type = val_type; - - static constexpr bool is_specialized = true; - static constexpr bool is_signed = true; - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return static_cast(0); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return Kokkos::abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return static_cast(0); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return static_cast(1); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return zero(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return static_cast(Kokkos::sqrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast(Kokkos::cbrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(Kokkos::exp(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(Kokkos::log(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(Kokkos::log10(abs(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // int doesn't implement a NaN value, but we can still have it - // return some "flag" value that can help users find use of - // uninitialized data. - return -one(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } + static constexpr bool is_exact = true; + static constexpr bool is_complex = false; + + static constexpr bool has_infinity = false; // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; @@ -2843,19 +2039,39 @@ class ArithTraits { static constexpr bool isOrdinal = true; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } + + static std::string name() { return "unsigned short"; } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS) +}; + +template <> +class ArithTraits { + public: + using val_type = int; + using mag_type = val_type; + + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = true; + static constexpr bool is_exact = true; + static constexpr bool is_complex = false; + + static constexpr bool has_infinity = false; + + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + using halfPrecision = val_type; + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = true; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = false; + static std::string name() { return "int"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS) }; template <> @@ -2871,93 +2087,6 @@ class ArithTraits { static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return static_cast(0); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; // it's unsigned, so it's positive - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return static_cast(0); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return static_cast(1); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return zero(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return static_cast(Kokkos::sqrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast(Kokkos::cbrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(Kokkos::exp(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(Kokkos::log(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(Kokkos::log10(x)); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // unsigned int doesn't implement a NaN value, but we can still - // have it return some "flag" value that can help users find use - // of uninitialized data. - return max(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; @@ -2968,19 +2097,10 @@ class ArithTraits { static constexpr bool isOrdinal = true; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } + static std::string name() { return "unsigned int"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS) }; template <> @@ -2996,87 +2116,6 @@ class ArithTraits { static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return static_cast(0); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x >= 0 ? x : -x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return static_cast(0); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return static_cast(1); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return zero(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return static_cast(Kokkos::sqrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(Kokkos::log(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(Kokkos::log10(abs(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // long doesn't implement a NaN value, but we can still have it - // return some "flag" value that can help users find use of - // uninitialized data. - return -one(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; @@ -3087,19 +2126,10 @@ class ArithTraits { static constexpr bool isOrdinal = true; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } + static std::string name() { return "long"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS) }; template <> @@ -3115,93 +2145,6 @@ class ArithTraits { static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return static_cast(0); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return static_cast(0); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return static_cast(1); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return zero(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return static_cast(Kokkos::sqrt(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast(Kokkos::cbrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(Kokkos::exp(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(Kokkos::log(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(Kokkos::log10(x)); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // unsigned long doesn't implement a NaN value, but we can still - // have it return some "flag" value that can help users find use - // of uninitialized data. - return max(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; @@ -3212,19 +2155,10 @@ class ArithTraits { static constexpr bool isOrdinal = true; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } + static std::string name() { return "unsigned long"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS) }; template <> @@ -3240,93 +2174,6 @@ class ArithTraits { static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return static_cast(0); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return Kokkos::abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return static_cast(0); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return static_cast(1); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return zero(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return static_cast(Kokkos::sqrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast(Kokkos::cbrt(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(Kokkos::exp(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(Kokkos::log(abs(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(Kokkos::log10(abs(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // long long doesn't implement a NaN value, but we can still have - // it return some "flag" value that can help users find use of - // uninitialized data. - return -one(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; @@ -3337,19 +2184,10 @@ class ArithTraits { static constexpr bool isOrdinal = true; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } + static std::string name() { return "long long"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS) }; template <> @@ -3365,93 +2203,6 @@ class ArithTraits { static constexpr bool is_complex = false; static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return static_cast(0); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return static_cast(0); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return static_cast(1); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return zero(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return static_cast(Kokkos::sqrt(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast(Kokkos::cbrt(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(Kokkos::exp(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(Kokkos::log(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(Kokkos::log10(x)); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // unsigned long long doesn't implement a NaN value, but we can - // still have it return some "flag" value that can help users find - // use of uninitialized data. - return max(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; @@ -3462,19 +2213,10 @@ class ArithTraits { static constexpr bool isOrdinal = true; static constexpr bool isComparable = true; static constexpr bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } + static std::string name() { return "unsigned long long"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS) }; // dd_real and qd_real are floating-point types provided by the QD @@ -3492,8 +2234,12 @@ class ArithTraits { // Hence, the class methods of the ArithTraits specializations for // dd_real and qd_real are not marked as device functions. #ifdef HAVE_KOKKOS_QD +// LBV: I would like to deprecate this strange optional +// dependency on the lbnl package, is there anyone actully +// using this? It certainly is never tested by CI or nightly +// so probably does not work... template <> -struct ArithTraits { +struct [[deprecated]] ArithTraits { typedef dd_real val_type; typedef dd_real mag_type; @@ -3536,43 +2282,43 @@ struct ArithTraits { return ::log(x); } static inline val_type log10(const val_type& x) { return ::log10(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { + static KOKKOS_FUNCTION val_type sin(const val_type x) { return ::sin(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { + static KOKKOS_FUNCTION val_type cos(const val_type x) { return ::cos(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { + static KOKKOS_FUNCTION val_type tan(const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::tan(x); #else return std::tan(x); #endif } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { + static KOKKOS_FUNCTION val_type sinh(const val_type x) { return ::sinh(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { + static KOKKOS_FUNCTION val_type cosh(const val_type x) { return ::cosh(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { + static KOKKOS_FUNCTION val_type tanh(const val_type x) { return ::tanh(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { + static KOKKOS_FUNCTION val_type asin(const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::asin(x); #else return ::asin(x); #endif } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { + static KOKKOS_FUNCTION val_type acos(const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::acos(x); #else return ::acos(x); #endif } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { + static KOKKOS_FUNCTION val_type atan(const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::atan(x); #else @@ -3619,7 +2365,7 @@ struct ArithTraits { }; template <> -struct ArithTraits { +struct [[deprecated]] ArithTraits { typedef qd_real val_type; typedef qd_real mag_type; @@ -3662,43 +2408,43 @@ struct ArithTraits { return ::log(x); } static inline val_type log10(const val_type& x) { return ::log10(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { + static KOKKOS_FUNCTION val_type sin(const val_type x) { return ::sin(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { + static KOKKOS_FUNCTION val_type cos(const val_type x) { return ::cos(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { + static KOKKOS_FUNCTION val_type tan(const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::tan(x); #else return std::tan(x); #endif } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { + static KOKKOS_FUNCTION val_type sinh(const val_type x) { return ::sinh(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { + static KOKKOS_FUNCTION val_type cosh(const val_type x) { return ::cosh(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { + static KOKKOS_FUNCTION val_type tanh(const val_type x) { return ::tanh(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { + static KOKKOS_FUNCTION val_type asin(const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::asin(x); #else return ::asin(x); #endif } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { + static KOKKOS_FUNCTION val_type acos(const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::acos(x); #else return ::acos(x); #endif } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { + static KOKKOS_FUNCTION val_type atan(const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::atan(x); #else diff --git a/unit_test/common/Test_Common_ArithTraits.hpp b/unit_test/common/Test_Common_ArithTraits.hpp index 38a6ba7d78..f232529b94 100644 --- a/unit_test/common/Test_Common_ArithTraits.hpp +++ b/unit_test/common/Test_Common_ArithTraits.hpp @@ -1722,6 +1722,10 @@ int runAllArithTraitsHostTests(std::ostream& out, const int verbose) { // testArithTraitsOnHost, DeviceType> (out, // verbose); +#if defined(KOKKOS_ENABLE_LIBQUADMATH) + success = success && curSuccess; + curSuccess = testArithTraitsOnHost<__float128, DeviceType>(out, verbose); +#endif return success && curSuccess; } From 5808a79059910f7ffba35044a7957652e4a8ac48 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 1 Jun 2022 17:50:40 -0600 Subject: [PATCH 169/261] ArithTraits: applying clang-format --- src/common/Kokkos_ArithTraits.hpp | 711 +++++++------------ unit_test/common/Test_Common_ArithTraits.hpp | 2 +- 2 files changed, 260 insertions(+), 453 deletions(-) diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp index bb128d32c1..7a0a9160c8 100644 --- a/src/common/Kokkos_ArithTraits.hpp +++ b/src/common/Kokkos_ArithTraits.hpp @@ -228,349 +228,227 @@ namespace Details { // Macro to automate the wrapping of Kokkos Mathematical Functions // in the ArithTraits struct for real floating point types, hopefully // this can be expanded to Kokkos::half_t and Kokkos::bhalf_t -#define KOKKOSKERNELS_ARITHTRAITS_REAL_FP(FUNC_QUAL) \ - static FUNC_QUAL val_type zero() { \ - return static_cast(0.0); \ - } \ - static FUNC_QUAL val_type one() { \ - return static_cast(1.0); \ - } \ - static FUNC_QUAL val_type min() { \ - return Kokkos::Experimental::finite_min::value; \ - } \ - static FUNC_QUAL val_type max() { \ - return Kokkos::Experimental::finite_max::value; \ - } \ - static FUNC_QUAL val_type infinity() { \ - return Kokkos::Experimental::infinity::value; \ - } \ - static FUNC_QUAL val_type nan() { \ - return Kokkos::Experimental::quiet_NaN::value; \ - } \ - static FUNC_QUAL mag_type epsilon() { \ - return Kokkos::Experimental::epsilon::value; \ - } \ - static FUNC_QUAL mag_type sfmin() { \ - return Kokkos::Experimental::norm_min::value; \ - } \ - static FUNC_QUAL int base() { \ - return Kokkos::Experimental::radix::value; \ - } \ - static FUNC_QUAL mag_type prec() { \ - return epsilon() * static_cast(base()); \ - } \ - static FUNC_QUAL int t() { \ - return Kokkos::Experimental::digits::value; \ - } \ - static FUNC_QUAL mag_type rnd() { return one(); } \ - static FUNC_QUAL int emin() { \ - return Kokkos::Experimental::min_exponent::value; \ - } \ - static FUNC_QUAL mag_type rmin() { \ - return Kokkos::Experimental::norm_min::value; \ - } \ - static FUNC_QUAL int emax() { \ - return Kokkos::Experimental::max_exponent::value; \ - } \ - static FUNC_QUAL mag_type rmax() { \ - return Kokkos::Experimental::finite_max< \ - val_type>::value; \ - } \ - \ - static FUNC_QUAL bool isInf(const val_type x) { \ - return Kokkos::isinf(x); \ - } \ - static FUNC_QUAL bool isNan(const val_type x) { \ - return Kokkos::isnan(x); \ - } \ - static FUNC_QUAL mag_type abs(const val_type x) { \ - return Kokkos::abs(x); \ - } \ - static FUNC_QUAL mag_type real(const val_type x) { \ - return x; \ - } \ - static FUNC_QUAL mag_type imag(const val_type) { \ - return zero(); \ - } \ - static FUNC_QUAL val_type conj(const val_type x) { \ - return x; \ - } \ - static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type sqrt(const val_type x) { \ - return Kokkos::sqrt(x); \ - } \ - static FUNC_QUAL val_type cbrt(const val_type x) { \ - return Kokkos::cbrt(x); \ - } \ - static FUNC_QUAL val_type exp(const val_type x) { \ - return Kokkos::exp(x); \ - } \ - static FUNC_QUAL val_type log(const val_type x) { \ - return Kokkos::log(x); \ - } \ - static FUNC_QUAL val_type log10(const val_type x) { \ - return Kokkos::log10(x); \ - } \ - static FUNC_QUAL val_type sin(const val_type x) { \ - return Kokkos::sin(x); \ - } \ - static FUNC_QUAL val_type cos(const val_type x) { \ - return Kokkos::cos(x); \ - } \ - static FUNC_QUAL val_type tan(const val_type x) { \ - return Kokkos::tan(x); \ - } \ - static FUNC_QUAL val_type sinh(const val_type x) { \ - return Kokkos::sinh(x); \ - } \ - static FUNC_QUAL val_type cosh(const val_type x) { \ - return Kokkos::cosh(x); \ - } \ - static FUNC_QUAL val_type tanh(const val_type x) { \ - return Kokkos::tanh(x); \ - } \ - static FUNC_QUAL val_type asin(const val_type x) { \ - return Kokkos::asin(x); \ - } \ - static FUNC_QUAL val_type acos(const val_type x) { \ - return Kokkos::acos(x); \ - } \ - static FUNC_QUAL val_type atan(const val_type x) { \ - return Kokkos::atan(x); \ - } \ - \ - static FUNC_QUAL bool isnaninf(const val_type x) { \ - return isNan(x) || isInf(x); \ - } \ - static FUNC_QUAL magnitudeType magnitude(const val_type x) { \ - return abs(x); \ - } \ - static FUNC_QUAL val_type conjugate(const val_type x) { \ - return conj(x); \ - } \ - static FUNC_QUAL val_type squareroot(const val_type x) { \ - return sqrt(x); \ - } \ +#define KOKKOSKERNELS_ARITHTRAITS_REAL_FP(FUNC_QUAL) \ + static FUNC_QUAL val_type zero() { return static_cast(0.0); } \ + static FUNC_QUAL val_type one() { return static_cast(1.0); } \ + static FUNC_QUAL val_type min() { \ + return Kokkos::Experimental::finite_min::value; \ + } \ + static FUNC_QUAL val_type max() { \ + return Kokkos::Experimental::finite_max::value; \ + } \ + static FUNC_QUAL val_type infinity() { \ + return Kokkos::Experimental::infinity::value; \ + } \ + static FUNC_QUAL val_type nan() { \ + return Kokkos::Experimental::quiet_NaN::value; \ + } \ + static FUNC_QUAL mag_type epsilon() { \ + return Kokkos::Experimental::epsilon::value; \ + } \ + static FUNC_QUAL mag_type sfmin() { \ + return Kokkos::Experimental::norm_min::value; \ + } \ + static FUNC_QUAL int base() { \ + return Kokkos::Experimental::radix::value; \ + } \ + static FUNC_QUAL mag_type prec() { \ + return epsilon() * static_cast(base()); \ + } \ + static FUNC_QUAL int t() { \ + return Kokkos::Experimental::digits::value; \ + } \ + static FUNC_QUAL mag_type rnd() { return one(); } \ + static FUNC_QUAL int emin() { \ + return Kokkos::Experimental::min_exponent::value; \ + } \ + static FUNC_QUAL mag_type rmin() { \ + return Kokkos::Experimental::norm_min::value; \ + } \ + static FUNC_QUAL int emax() { \ + return Kokkos::Experimental::max_exponent::value; \ + } \ + static FUNC_QUAL mag_type rmax() { \ + return Kokkos::Experimental::finite_max::value; \ + } \ + \ + static FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); } \ + static FUNC_QUAL bool isNan(const val_type x) { return Kokkos::isnan(x); } \ + static FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); } \ + static FUNC_QUAL mag_type real(const val_type x) { return x; } \ + static FUNC_QUAL mag_type imag(const val_type) { return zero(); } \ + static FUNC_QUAL val_type conj(const val_type x) { return x; } \ + static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } \ + static FUNC_QUAL val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } \ + static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ + static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ + static FUNC_QUAL val_type log10(const val_type x) { \ + return Kokkos::log10(x); \ + } \ + static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ + static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ + static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ + static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ + static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ + static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ + static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ + static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ + static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ + \ + static FUNC_QUAL bool isnaninf(const val_type x) { \ + return isNan(x) || isInf(x); \ + } \ + static FUNC_QUAL magnitudeType magnitude(const val_type x) { \ + return abs(x); \ + } \ + static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ + static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ static FUNC_QUAL mag_type eps() { return epsilon(); } -#define KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(FUNC_QUAL) \ - static FUNC_QUAL val_type zero() { \ - return val_type(ArithTraits::zero(), \ - ArithTraits::zero()); \ - } \ - static FUNC_QUAL val_type one() { \ - return val_type(ArithTraits::one(), \ - ArithTraits::zero()); \ - } \ - static FUNC_QUAL val_type min() { \ - return val_type(ArithTraits::min(), \ - ArithTraits::min()); \ - } \ - static FUNC_QUAL val_type max() { \ - return val_type(ArithTraits::max(), \ - ArithTraits::max()); \ - } \ - static FUNC_QUAL val_type infinity() { \ - return val_type(ArithTraits::infinity(), \ - ArithTraits::infinity()); \ - } \ - static FUNC_QUAL val_type nan() { \ - return val_type(ArithTraits::nan(), \ - ArithTraits::nan()); \ - } \ - static FUNC_QUAL mag_type epsilon() { \ - return ArithTraits::epsilon(); \ - } \ - static FUNC_QUAL mag_type sfmin() { \ - return ArithTraits::sfmin(); \ - } \ - static FUNC_QUAL int base() { \ - return ArithTraits::base(); \ - } \ - static FUNC_QUAL mag_type prec() { \ - return ArithTraits::prec(); \ - } \ - static FUNC_QUAL int t() { \ - return ArithTraits::t(); \ - } \ - static FUNC_QUAL mag_type rnd() { \ - return ArithTraits::rnd(); \ - } \ - static FUNC_QUAL int emin() { \ - return ArithTraits::emin(); \ - } \ - static FUNC_QUAL mag_type rmin() { \ - return ArithTraits::rmin(); \ - } \ - static FUNC_QUAL int emax() { \ - return ArithTraits::emax(); \ - } \ - static FUNC_QUAL mag_type rmax() { \ - return ArithTraits::rmax(); \ - } \ - static FUNC_QUAL bool isInf(const val_type x) { \ - return ArithTraits::isInf(x.real()) || \ - ArithTraits::isInf(x.imag()); \ - } \ - static FUNC_QUAL bool isNan(const val_type x) { \ - return ArithTraits::isNan(x.real()) || \ - ArithTraits::isNan(x.imag()); \ - } \ - static FUNC_QUAL mag_type abs(const val_type x) { \ - return ::Kokkos::abs(x); \ - } \ - static FUNC_QUAL mag_type real(const val_type x) { \ - return x.real(); \ - } \ - static FUNC_QUAL mag_type imag(const val_type x) { \ - return x.imag(); \ - } \ - static FUNC_QUAL val_type conj(const val_type x) { \ - return ::Kokkos::conj(x); \ - } \ - static FUNC_QUAL val_type pow (const val_type x, const \ - val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type pow (const val_type x, const \ - mag_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type pow (const mag_type x, const \ - val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static FUNC_QUAL val_type sqrt(const val_type x) { \ - return ::Kokkos::sqrt(x); \ - } \ - static FUNC_QUAL val_type exp (const val_type x) { \ - return Kokkos::exp(x); \ - } \ - static FUNC_QUAL val_type log (const val_type x) { \ - return Kokkos::log(x); \ - } \ - static FUNC_QUAL val_type log10 (const val_type x) { \ - return Kokkos::log10(x); \ - } \ - static FUNC_QUAL val_type sin (const val_type x) { \ - return Kokkos::sin(x); \ - } \ - static FUNC_QUAL val_type cos (const val_type x) { \ - return Kokkos::cos(x); \ - } \ - static FUNC_QUAL val_type tan (const val_type x) { \ - return Kokkos::tan(x); \ - } \ - static FUNC_QUAL val_type sinh (const val_type x) { \ - return Kokkos::sinh(x); \ - } \ - static FUNC_QUAL val_type cosh (const val_type x) { \ - return Kokkos::cosh(x); \ - } \ - static FUNC_QUAL val_type tanh (const val_type x) { \ - return Kokkos::tanh(x); \ - } \ - static FUNC_QUAL val_type asin (const val_type x) { \ - return Kokkos::asin(x); \ - } \ - static FUNC_QUAL val_type acos (const val_type x) { \ - return Kokkos::acos(x); \ - } \ - static FUNC_QUAL val_type atan (const val_type x) { \ - return Kokkos::atan(x); \ - } \ - static FUNC_QUAL bool isnaninf(const val_type& x) { \ - return isNan(x) || isInf(x); \ - } \ - static FUNC_QUAL mag_type magnitude(const val_type x) { \ - return abs(x); \ - } \ - static FUNC_QUAL val_type conjugate(const val_type x) { \ - return conj(x); \ - } \ - static FUNC_QUAL val_type squareroot (const val_type x) { \ - return sqrt (x); \ - } \ +#define KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(FUNC_QUAL) \ + static FUNC_QUAL val_type zero() { \ + return val_type(ArithTraits::zero(), \ + ArithTraits::zero()); \ + } \ + static FUNC_QUAL val_type one() { \ + return val_type(ArithTraits::one(), \ + ArithTraits::zero()); \ + } \ + static FUNC_QUAL val_type min() { \ + return val_type(ArithTraits::min(), \ + ArithTraits::min()); \ + } \ + static FUNC_QUAL val_type max() { \ + return val_type(ArithTraits::max(), \ + ArithTraits::max()); \ + } \ + static FUNC_QUAL val_type infinity() { \ + return val_type(ArithTraits::infinity(), \ + ArithTraits::infinity()); \ + } \ + static FUNC_QUAL val_type nan() { \ + return val_type(ArithTraits::nan(), \ + ArithTraits::nan()); \ + } \ + static FUNC_QUAL mag_type epsilon() { \ + return ArithTraits::epsilon(); \ + } \ + static FUNC_QUAL mag_type sfmin() { return ArithTraits::sfmin(); } \ + static FUNC_QUAL int base() { return ArithTraits::base(); } \ + static FUNC_QUAL mag_type prec() { return ArithTraits::prec(); } \ + static FUNC_QUAL int t() { return ArithTraits::t(); } \ + static FUNC_QUAL mag_type rnd() { return ArithTraits::rnd(); } \ + static FUNC_QUAL int emin() { return ArithTraits::emin(); } \ + static FUNC_QUAL mag_type rmin() { return ArithTraits::rmin(); } \ + static FUNC_QUAL int emax() { return ArithTraits::emax(); } \ + static FUNC_QUAL mag_type rmax() { return ArithTraits::rmax(); } \ + static FUNC_QUAL bool isInf(const val_type x) { \ + return ArithTraits::isInf(x.real()) || \ + ArithTraits::isInf(x.imag()); \ + } \ + static FUNC_QUAL bool isNan(const val_type x) { \ + return ArithTraits::isNan(x.real()) || \ + ArithTraits::isNan(x.imag()); \ + } \ + static FUNC_QUAL mag_type abs(const val_type x) { return ::Kokkos::abs(x); } \ + static FUNC_QUAL mag_type real(const val_type x) { return x.real(); } \ + static FUNC_QUAL mag_type imag(const val_type x) { return x.imag(); } \ + static FUNC_QUAL val_type conj(const val_type x) { \ + return ::Kokkos::conj(x); \ + } \ + static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type pow(const val_type x, const mag_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type pow(const mag_type x, const val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type sqrt(const val_type x) { \ + return ::Kokkos::sqrt(x); \ + } \ + static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ + static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ + static FUNC_QUAL val_type log10(const val_type x) { \ + return Kokkos::log10(x); \ + } \ + static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ + static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ + static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ + static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ + static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ + static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ + static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ + static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ + static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ + static FUNC_QUAL bool isnaninf(const val_type& x) { \ + return isNan(x) || isInf(x); \ + } \ + static FUNC_QUAL mag_type magnitude(const val_type x) { return abs(x); } \ + static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ + static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ static FUNC_QUAL mag_type eps() { return epsilon(); } -#define KOKKOSKERNELS_SIGNED_ABS \ - static KOKKOS_FUNCTION mag_type abs(const val_type x) { \ - return Kokkos::abs(x); \ - } \ - -#define KOKKOSKERNELS_UNSIGNED_ABS \ - static KOKKOS_FUNCTION mag_type abs(const val_type x) { \ - return x; \ - } \ - -#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_ABS) \ - static KOKKOS_FUNCTION val_type zero() { \ - return static_cast(0); \ - } \ - static KOKKOS_FUNCTION val_type one() { \ - return static_cast(1); \ - } \ - static KOKKOS_FUNCTION val_type min() { \ - return Kokkos::Experimental::finite_min::value; \ - } \ - static KOKKOS_FUNCTION val_type max() { \ - return Kokkos::Experimental::finite_max::value; \ - } \ - static KOKKOS_FUNCTION val_type infinity() { \ - return static_cast(0); \ - } \ - static KOKKOS_FUNCTION bool isInf(const val_type) { \ - return false; \ - } \ - static KOKKOS_FUNCTION bool isNan(const val_type) { \ - return false; \ - } \ - KOKKOSKERNELS_ABS \ - static KOKKOS_FUNCTION mag_type real(const val_type x) { \ - return x; \ - } \ - static KOKKOS_FUNCTION mag_type imag(const val_type) { \ - return zero(); \ - } \ - static KOKKOS_FUNCTION val_type conj(const val_type x) { \ - return x; \ - } \ - static KOKKOS_FUNCTION val_type pow(const val_type x, \ - const val_type y) { \ - return Kokkos::pow(x, y); \ - } \ - static KOKKOS_FUNCTION val_type sqrt(const val_type x) { \ - return static_cast(Kokkos::sqrt(abs(x))); \ - } \ - static KOKKOS_FUNCTION val_type cbrt(const val_type x) { \ - return static_cast(Kokkos::cbrt(abs(x))); \ - } \ - static KOKKOS_FUNCTION val_type exp(const val_type x) { \ - return static_cast(Kokkos::exp(abs(x))); \ - } \ - static KOKKOS_FUNCTION val_type log(const val_type x) { \ - return static_cast(Kokkos::log(abs(x))); \ - } \ - static KOKKOS_FUNCTION val_type log10(const val_type x) { \ - return static_cast(Kokkos::log10(abs(x))); \ - } \ - static KOKKOS_FUNCTION mag_type epsilon() { return zero(); } \ - static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { \ - return abs(x); \ - } \ - static KOKKOS_FUNCTION val_type conjugate(const val_type x) { \ - return conj(x); \ - } \ - static KOKKOS_FUNCTION bool isnaninf(const val_type) { \ - return false; \ - } \ - static KOKKOS_FUNCTION val_type squareroot(const val_type x) { \ - return sqrt(x); \ +#define KOKKOSKERNELS_SIGNED_ABS \ + static KOKKOS_FUNCTION mag_type abs(const val_type x) { \ + return Kokkos::abs(x); \ + } + +#define KOKKOSKERNELS_UNSIGNED_ABS \ + static KOKKOS_FUNCTION mag_type abs(const val_type x) { return x; } + +#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_ABS) \ + static KOKKOS_FUNCTION val_type zero() { return static_cast(0); } \ + static KOKKOS_FUNCTION val_type one() { return static_cast(1); } \ + static KOKKOS_FUNCTION val_type min() { \ + return Kokkos::Experimental::finite_min::value; \ + } \ + static KOKKOS_FUNCTION val_type max() { \ + return Kokkos::Experimental::finite_max::value; \ + } \ + static KOKKOS_FUNCTION val_type infinity() { \ + return static_cast(0); \ + } \ + static KOKKOS_FUNCTION bool isInf(const val_type) { return false; } \ + static KOKKOS_FUNCTION bool isNan(const val_type) { return false; } \ + KOKKOSKERNELS_ABS \ + static KOKKOS_FUNCTION mag_type real(const val_type x) { return x; } \ + static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); } \ + static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } \ + static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static KOKKOS_FUNCTION val_type sqrt(const val_type x) { \ + return static_cast(Kokkos::sqrt(abs(x))); \ + } \ + static KOKKOS_FUNCTION val_type cbrt(const val_type x) { \ + return static_cast(Kokkos::cbrt(abs(x))); \ + } \ + static KOKKOS_FUNCTION val_type exp(const val_type x) { \ + return static_cast(Kokkos::exp(abs(x))); \ + } \ + static KOKKOS_FUNCTION val_type log(const val_type x) { \ + return static_cast(Kokkos::log(abs(x))); \ + } \ + static KOKKOS_FUNCTION val_type log10(const val_type x) { \ + return static_cast(Kokkos::log10(abs(x))); \ + } \ + static KOKKOS_FUNCTION mag_type epsilon() { return zero(); } \ + static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { \ + return abs(x); \ + } \ + static KOKKOS_FUNCTION val_type conjugate(const val_type x) { \ + return conj(x); \ + } \ + static KOKKOS_FUNCTION bool isnaninf(const val_type) { return false; } \ + static KOKKOS_FUNCTION val_type squareroot(const val_type x) { \ + return sqrt(x); \ } - /// \class ArithTraits /// \brief Traits class for arithmetic on type T. /// \tparam T "Scalar" type of interest @@ -1045,17 +923,10 @@ class ArithTraits { static KOKKOS_FUNCTION val_type max() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); } - static KOKKOS_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FUNCTION mag_type imag(const val_type) { - return zero(); - } - static KOKKOS_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FUNCTION val_type pow(const val_type x, - const val_type y) { + static KOKKOS_FUNCTION mag_type real(const val_type x) { return x; } + static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); } + static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } + static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { return Kokkos::Experimental::cast_to_half( Kokkos::pow(Kokkos::Experimental::cast_from_half(x), Kokkos::Experimental::cast_from_half(y))); @@ -1150,9 +1021,7 @@ class ArithTraits { static KOKKOS_FUNCTION mag_type sfmin() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); } - static KOKKOS_FUNCTION int base() { - return KOKKOSKERNELS_IMPL_FP16_RADIX; - } + static KOKKOS_FUNCTION int base() { return KOKKOSKERNELS_IMPL_FP16_RADIX; } // Use float to allow running on both host and device static KOKKOS_FUNCTION float prec() { float e = KOKKOSKERNELS_IMPL_FP16_EPSILON; @@ -1160,19 +1029,13 @@ class ArithTraits { float r = e * b; return r; } - static KOKKOS_FUNCTION int t() { - return KOKKOSKERNELS_IMPL_FP16_MANT_DIG; - } + static KOKKOS_FUNCTION int t() { return KOKKOSKERNELS_IMPL_FP16_MANT_DIG; } static KOKKOS_FUNCTION mag_type rnd() { return one(); } - static KOKKOS_FUNCTION int emin() { - return KOKKOSKERNELS_IMPL_FP16_MIN_EXP; - } + static KOKKOS_FUNCTION int emin() { return KOKKOSKERNELS_IMPL_FP16_MIN_EXP; } static KOKKOS_FUNCTION mag_type rmin() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); } - static KOKKOS_FUNCTION int emax() { - return KOKKOSKERNELS_IMPL_FP16_MAX_EXP; - } + static KOKKOS_FUNCTION int emax() { return KOKKOSKERNELS_IMPL_FP16_MAX_EXP; } static KOKKOS_FUNCTION mag_type rmax() { return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); } @@ -1222,17 +1085,12 @@ class ArithTraits { static KOKKOS_FUNCTION val_type max() { return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); } - static KOKKOS_FUNCTION mag_type real(const val_type x) { - return x; - } + static KOKKOS_FUNCTION mag_type real(const val_type x) { return x; } static KOKKOS_FUNCTION mag_type imag(const val_type) { return Kokkos::Experimental::cast_to_bhalf(0.0F); } - static KOKKOS_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FUNCTION val_type pow(const val_type x, - const val_type y) { + static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } + static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { return Kokkos::Experimental::cast_to_bhalf( Kokkos::pow(Kokkos::Experimental::cast_from_bhalf(x), Kokkos::Experimental::cast_from_bhalf(y))); @@ -1328,9 +1186,7 @@ class ArithTraits { static KOKKOS_FUNCTION mag_type sfmin() { return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN); } - static KOKKOS_FUNCTION int base() { - return KOKKOSKERNELS_IMPL_BF16_RADIX; - } + static KOKKOS_FUNCTION int base() { return KOKKOSKERNELS_IMPL_BF16_RADIX; } // Use float to allow running on both host and device static KOKKOS_FUNCTION float prec() { float e = KOKKOSKERNELS_IMPL_BF16_EPSILON; @@ -1338,19 +1194,13 @@ class ArithTraits { float r = e * b; return r; } - static KOKKOS_FUNCTION int t() { - return KOKKOSKERNELS_IMPL_BF16_MANT_DIG; - } + static KOKKOS_FUNCTION int t() { return KOKKOSKERNELS_IMPL_BF16_MANT_DIG; } static KOKKOS_FUNCTION mag_type rnd() { return one(); } - static KOKKOS_FUNCTION int emin() { - return KOKKOSKERNELS_IMPL_BF16_MIN_EXP; - } + static KOKKOS_FUNCTION int emin() { return KOKKOSKERNELS_IMPL_BF16_MIN_EXP; } static KOKKOS_FUNCTION mag_type rmin() { return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN); } - static KOKKOS_FUNCTION int emax() { - return KOKKOSKERNELS_IMPL_BF16_MAX_EXP; - } + static KOKKOS_FUNCTION int emax() { return KOKKOSKERNELS_IMPL_BF16_MAX_EXP; } static KOKKOS_FUNCTION mag_type rmax() { return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); } @@ -1371,8 +1221,8 @@ class ArithTraits { static constexpr bool has_infinity = true; // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = float; // Should we switch to Kokkos::half_t + using magnitudeType = mag_type; + using halfPrecision = float; // Should we switch to Kokkos::half_t using doublePrecision = double; static constexpr bool isComplex = false; @@ -1450,7 +1300,7 @@ class ArithTraits { static std::string name() { return "long double"; } - KOKKOSKERNELS_ARITHTRAITS_REAL_FP( ) + KOKKOSKERNELS_ARITHTRAITS_REAL_FP() }; // long double specialization template <> @@ -1464,7 +1314,7 @@ class ArithTraits< ::Kokkos::complex > { static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr bool is_complex = true; - static constexpr bool has_infinity = true; + static constexpr bool has_infinity = true; // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; @@ -1514,7 +1364,6 @@ class ArithTraits< ::Kokkos::complex > { KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(KOKKOS_FUNCTION) }; - /// \brief Partial specialization for std::complex. /// /// The C++ Standard Library (with C++03 at least) only allows @@ -1770,7 +1619,7 @@ class ArithTraits<__float128> { static constexpr bool is_integer = false; static constexpr bool is_exact = false; static constexpr bool is_complex = false; - static constexpr bool has_infinity = true; + static constexpr bool has_infinity = true; // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; @@ -1794,24 +1643,16 @@ class ArithTraits<__float128> { static val_type infinity() { return Kokkos::Experimental::infinity::value; } - static val_type nan() { - return Kokkos::Experimental::nanq(""); - } + static val_type nan() { return Kokkos::Experimental::nanq(""); } static mag_type epsilon() { return Kokkos::Experimental::epsilon::value; } static mag_type sfmin() { return Kokkos::Experimental::norm_min::value; } - static int base() { - return Kokkos::Experimental::radix::value; - } - static mag_type prec() { - return epsilon() * static_cast(base()); - } - static int t() { - return Kokkos::Experimental::digits::value; - } + static int base() { return Kokkos::Experimental::radix::value; } + static mag_type prec() { return epsilon() * static_cast(base()); } + static int t() { return Kokkos::Experimental::digits::value; } static mag_type rnd() { return static_cast(1.0); } static int emin() { return Kokkos::Experimental::min_exponent::value; @@ -1828,12 +1669,8 @@ class ArithTraits<__float128> { } // Math Functions - static bool isInf(const val_type x) { - return Kokkos::Experimental::isinf(x); - } - static bool isNan(const val_type x) { - return Kokkos::Experimental::isnan(x); - } + static bool isInf(const val_type x) { return Kokkos::Experimental::isinf(x); } + static bool isNan(const val_type x) { return Kokkos::Experimental::isnan(x); } static mag_type abs(const val_type x) { return Kokkos::Experimental::fabs(x); } @@ -1849,24 +1686,14 @@ class ArithTraits<__float128> { static val_type cbrt(const val_type x) { return Kokkos::Experimental::cbrt(x); } - static val_type exp(const val_type x) { - return Kokkos::Experimental::exp(x); - } - static val_type log(const val_type x) { - return Kokkos::Experimental::log(x); - } + static val_type exp(const val_type x) { return Kokkos::Experimental::exp(x); } + static val_type log(const val_type x) { return Kokkos::Experimental::log(x); } static val_type log10(const val_type x) { return Kokkos::Experimental::log10(x); } - static val_type sin(const val_type x) { - return Kokkos::Experimental::sin(x); - } - static val_type cos(const val_type x) { - return Kokkos::Experimental::cos(x); - } - static val_type tan(const val_type x) { - return Kokkos::Experimental::tan(x); - } + static val_type sin(const val_type x) { return Kokkos::Experimental::sin(x); } + static val_type cos(const val_type x) { return Kokkos::Experimental::cos(x); } + static val_type tan(const val_type x) { return Kokkos::Experimental::tan(x); } static val_type sinh(const val_type x) { return Kokkos::Experimental::sinh(x); } @@ -1886,15 +1713,15 @@ class ArithTraits<__float128> { return Kokkos::Experimental::atan(x); } - //Aliases + // Aliases static bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } static magnitudeType magnitude(const val_type x) { return abs(x); } static val_type conjugate(const val_type x) { return conj(x); } static std::string name() { return "__float128"; } static val_type squareroot(const val_type x) { return sqrt(x); } static mag_type eps() { return epsilon(); } -}; // __float128 specialization -#endif // KOKKOS_ENABLE_LIBQUADMATH +}; // __float128 specialization +#endif // KOKKOS_ENABLE_LIBQUADMATH template <> class ArithTraits { @@ -2282,12 +2109,8 @@ struct [[deprecated]] ArithTraits { return ::log(x); } static inline val_type log10(const val_type& x) { return ::log10(x); } - static KOKKOS_FUNCTION val_type sin(const val_type x) { - return ::sin(x); - } - static KOKKOS_FUNCTION val_type cos(const val_type x) { - return ::cos(x); - } + static KOKKOS_FUNCTION val_type sin(const val_type x) { return ::sin(x); } + static KOKKOS_FUNCTION val_type cos(const val_type x) { return ::cos(x); } static KOKKOS_FUNCTION val_type tan(const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::tan(x); @@ -2295,15 +2118,9 @@ struct [[deprecated]] ArithTraits { return std::tan(x); #endif } - static KOKKOS_FUNCTION val_type sinh(const val_type x) { - return ::sinh(x); - } - static KOKKOS_FUNCTION val_type cosh(const val_type x) { - return ::cosh(x); - } - static KOKKOS_FUNCTION val_type tanh(const val_type x) { - return ::tanh(x); - } + static KOKKOS_FUNCTION val_type sinh(const val_type x) { return ::sinh(x); } + static KOKKOS_FUNCTION val_type cosh(const val_type x) { return ::cosh(x); } + static KOKKOS_FUNCTION val_type tanh(const val_type x) { return ::tanh(x); } static KOKKOS_FUNCTION val_type asin(const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::asin(x); @@ -2408,12 +2225,8 @@ struct [[deprecated]] ArithTraits { return ::log(x); } static inline val_type log10(const val_type& x) { return ::log10(x); } - static KOKKOS_FUNCTION val_type sin(const val_type x) { - return ::sin(x); - } - static KOKKOS_FUNCTION val_type cos(const val_type x) { - return ::cos(x); - } + static KOKKOS_FUNCTION val_type sin(const val_type x) { return ::sin(x); } + static KOKKOS_FUNCTION val_type cos(const val_type x) { return ::cos(x); } static KOKKOS_FUNCTION val_type tan(const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::tan(x); @@ -2421,15 +2234,9 @@ struct [[deprecated]] ArithTraits { return std::tan(x); #endif } - static KOKKOS_FUNCTION val_type sinh(const val_type x) { - return ::sinh(x); - } - static KOKKOS_FUNCTION val_type cosh(const val_type x) { - return ::cosh(x); - } - static KOKKOS_FUNCTION val_type tanh(const val_type x) { - return ::tanh(x); - } + static KOKKOS_FUNCTION val_type sinh(const val_type x) { return ::sinh(x); } + static KOKKOS_FUNCTION val_type cosh(const val_type x) { return ::cosh(x); } + static KOKKOS_FUNCTION val_type tanh(const val_type x) { return ::tanh(x); } static KOKKOS_FUNCTION val_type asin(const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL return sycl::asin(x); diff --git a/unit_test/common/Test_Common_ArithTraits.hpp b/unit_test/common/Test_Common_ArithTraits.hpp index f232529b94..073f879d8e 100644 --- a/unit_test/common/Test_Common_ArithTraits.hpp +++ b/unit_test/common/Test_Common_ArithTraits.hpp @@ -1723,7 +1723,7 @@ int runAllArithTraitsHostTests(std::ostream& out, const int verbose) { // verbose); #if defined(KOKKOS_ENABLE_LIBQUADMATH) - success = success && curSuccess; + success = success && curSuccess; curSuccess = testArithTraitsOnHost<__float128, DeviceType>(out, verbose); #endif return success && curSuccess; From 0104ec1986d5614e1e0243232db69ab5dc9ef043 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 2 Jun 2022 16:59:54 -0600 Subject: [PATCH 170/261] common clean-up: removing sparse and graph features from common A lot of things in the common folder are actually purely sparse and/or graph related. This clean-up is necessary ahead of the change of directory structure and to allow modular compilation of the library. --- example/gmres/ex_real_A.cpp | 4 +- example/gmres/test_cmplx_A.cpp | 3 +- example/gmres/test_prec.cpp | 5 +- example/gmres/test_real_A.cpp | 3 +- .../sparse/KokkosSparse_wiki_gauss_seidel.cpp | 3 +- perf_test/graph/KokkosGraph_color.cpp | 5 +- perf_test/graph/KokkosGraph_color_d2.cpp | 3 +- perf_test/graph/KokkosGraph_mis_d2.cpp | 3 +- perf_test/sparse/KokkosSparse_gs.cpp | 5 +- perf_test/sparse/KokkosSparse_kk_spmv.cpp | 5 +- .../sparse/KokkosSparse_multimem_spgemm.hpp | 17 +- perf_test/sparse/KokkosSparse_pcg.cpp | 3 +- perf_test/sparse/KokkosSparse_run_spgemm.hpp | 6 +- .../sparse/KokkosSparse_run_spgemm_jacobi.hpp | 23 +- perf_test/sparse/KokkosSparse_spadd.cpp | 4 +- perf_test/sparse/KokkosSparse_spiluk.cpp | 5 +- perf_test/sparse/KokkosSparse_spmv.cpp | 5 +- perf_test/sparse/KokkosSparse_sptrsv.cpp | 7 +- .../sparse/KokkosSparse_sptrsv_supernode.cpp | 5 +- src/common/KokkosKernels_IOUtils.hpp | 1252 ---------------- src/common/KokkosKernels_Sorting.hpp | 577 -------- src/common/KokkosKernels_Utils.hpp | 2 +- src/graph/KokkosGraph_ExplicitCoarsening.hpp | 10 +- .../tpls/KokkosKernels_tpl_handles_decl.hpp | 2 +- .../tpls/KokkosKernels_tpl_handles_def.hpp | 2 +- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 4 +- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 6 +- .../KokkosKernels_Controls.hpp | 0 .../KokkosKernels_Handle.hpp | 0 src/sparse/KokkosSparse_IOUtils.hpp | 1270 +++++++++++++++++ src/sparse/KokkosSparse_SortCrs.hpp | 725 ++++++++++ .../KokkosSparse_Utils.hpp} | 0 .../KokkosSparse_Utils_cusparse.hpp} | 0 .../KokkosSparse_Utils_mkl.hpp} | 0 .../KokkosSparse_Utils_rocsparse.hpp} | 0 src/sparse/KokkosSparse_sptrsv_cholmod.hpp | 2 +- src/sparse/KokkosSparse_sptrsv_supernode.hpp | 4 +- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 6 +- .../impl/KokkosSparse_spadd_symbolic_impl.hpp | 6 +- src/sparse/impl/KokkosSparse_spgemm_impl.hpp | 2 +- .../impl/KokkosSparse_spgemm_mkl_impl.hpp | 2 +- ...okkosSparse_twostage_gauss_seidel_impl.hpp | 8 +- unit_test/common/Test_Common.hpp | 1 - unit_test/common/Test_Common_Sorting.hpp | 247 ---- unit_test/graph/Test_Graph_graph_color.hpp | 6 +- .../Test_Graph_graph_color_deterministic.hpp | 2 +- .../Test_Graph_graph_color_distance2.hpp | 10 +- unit_test/graph/Test_Graph_mis2.hpp | 7 +- unit_test/sparse/Test_Sparse.hpp | 2 + unit_test/sparse/Test_Sparse_SortCrs.hpp | 311 ++++ .../Test_Sparse_Transpose.hpp} | 11 +- .../sparse/Test_Sparse_Utils_cusparse.hpp | 2 +- .../sparse/Test_Sparse_block_gauss_seidel.hpp | 7 +- unit_test/sparse/Test_Sparse_bspgemm.hpp | 13 +- unit_test/sparse/Test_Sparse_gauss_seidel.hpp | 17 +- unit_test/sparse/Test_Sparse_rocsparse.hpp | 2 +- unit_test/sparse/Test_Sparse_spgemm.hpp | 13 +- .../sparse/Test_Sparse_spgemm_jacobi.hpp | 11 +- unit_test/sparse/Test_Sparse_spiluk.hpp | 2 +- unit_test/sparse/Test_Sparse_spmv.hpp | 9 +- unit_test/sparse/Test_Sparse_sptrsv.hpp | 2 +- unit_test/sparse/Test_Sparse_trsv.hpp | 5 +- 62 files changed, 2465 insertions(+), 2209 deletions(-) rename src/{common => sparse}/KokkosKernels_Controls.hpp (100%) rename src/{common => sparse}/KokkosKernels_Handle.hpp (100%) create mode 100644 src/sparse/KokkosSparse_IOUtils.hpp create mode 100644 src/sparse/KokkosSparse_SortCrs.hpp rename src/{common/KokkosKernels_SparseUtils.hpp => sparse/KokkosSparse_Utils.hpp} (100%) rename src/{common/KokkosKernels_SparseUtils_cusparse.hpp => sparse/KokkosSparse_Utils_cusparse.hpp} (100%) rename src/{common/KokkosKernels_SparseUtils_mkl.hpp => sparse/KokkosSparse_Utils_mkl.hpp} (100%) rename src/{common/KokkosKernels_SparseUtils_rocsparse.hpp => sparse/KokkosSparse_Utils_rocsparse.hpp} (100%) create mode 100644 unit_test/sparse/Test_Sparse_SortCrs.hpp rename unit_test/{common/Test_Common_Transpose.hpp => sparse/Test_Sparse_Transpose.hpp} (95%) diff --git a/example/gmres/ex_real_A.cpp b/example/gmres/ex_real_A.cpp index 1e3ba19585..b3e95605f7 100644 --- a/example/gmres/ex_real_A.cpp +++ b/example/gmres/ex_real_A.cpp @@ -43,7 +43,7 @@ */ #include -#include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" #include #include #include @@ -117,7 +117,7 @@ int main(int argc, char* argv[]) { { // Read in a matrix Market file and use it to test the Kokkos Operator. KokkosSparse::CrsMatrix A = - KokkosKernels::Impl::read_kokkos_crst_matrix< + KokkosSparse::Impl::read_kokkos_crst_matrix< KokkosSparse::CrsMatrix>(filename.c_str()); int n = A.numRows(); diff --git a/example/gmres/test_cmplx_A.cpp b/example/gmres/test_cmplx_A.cpp index bc1ddce35b..ad8d19fb03 100644 --- a/example/gmres/test_cmplx_A.cpp +++ b/example/gmres/test_cmplx_A.cpp @@ -44,6 +44,7 @@ #include #include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" #include #include #include @@ -77,7 +78,7 @@ int main(int /*argc*/, char** /*argv[]*/) { { // Read in a matrix Market file and use it to test the Kokkos Operator. KokkosSparse::CrsMatrix A = - KokkosKernels::Impl::read_kokkos_crst_matrix< + KokkosSparse::Impl::read_kokkos_crst_matrix< KokkosSparse::CrsMatrix>(filename.c_str()); int n = A.numRows(); diff --git a/example/gmres/test_prec.cpp b/example/gmres/test_prec.cpp index a75c9dc59a..11122edccd 100644 --- a/example/gmres/test_prec.cpp +++ b/example/gmres/test_prec.cpp @@ -48,6 +48,7 @@ #include #include #include +#include "KokkosSparse_IOUtils.hpp" int main(int argc, char* argv[]) { typedef double ST; @@ -114,13 +115,13 @@ int main(int argc, char* argv[]) { { // Generate a diagonal matrix with entries 1, 2, ...., 1000 and its inverse. KokkosSparse::CrsMatrix A = - KokkosKernels::Impl::kk_generate_diag_matrix< + KokkosSparse::Impl::kk_generate_diag_matrix< KokkosSparse::CrsMatrix>(n); KokkosSparse::Experimental::MatrixPrec* myPrec = new KokkosSparse::Experimental::MatrixPrec( - KokkosKernels::Impl::kk_generate_diag_matrix< + KokkosSparse::Impl::kk_generate_diag_matrix< KokkosSparse::CrsMatrix>(n, true)); ViewVectorType X(Kokkos::view_alloc(Kokkos::WithoutInitializing, "X"), diff --git a/example/gmres/test_real_A.cpp b/example/gmres/test_real_A.cpp index 26103da035..abfb3f0101 100644 --- a/example/gmres/test_real_A.cpp +++ b/example/gmres/test_real_A.cpp @@ -44,6 +44,7 @@ #include #include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" #include #include #include @@ -89,7 +90,7 @@ int main(int /*argc*/, char** /*argv[]*/) { cOT diagDominance = 1; nnz = 10 * numRows; sp_matrix_type A = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< sp_matrix_type>(numRows, numCols, nnz, 0, ncOT(0.01 * numRows), diagDominance); diff --git a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp index 1fc1fc37d2..57b8ddd4ec 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp @@ -2,6 +2,7 @@ #include "KokkosKernels_default_types.hpp" #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_gauss_seidel.hpp" @@ -37,7 +38,7 @@ int main() //Get approx. 20 entries per row //Diagonals are 2x the absolute sum of all other entries. Offset nnz = numRows * 20; - Matrix A = KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix(numRows, numRows, nnz, 2, 100, 1.05 * one); + Matrix A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix(numRows, numRows, nnz, 2, 100, 1.05 * one); std::cout << "Generated a matrix with " << numRows << " rows/cols, and " << nnz << " entries.\n"; //Create a kernel handle, then a Gauss-Seidel handle with the default algorithm Handle handle; diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index 8b16111157..7c6dda889f 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -55,6 +55,7 @@ #include "KokkosKernels_TestParameters.hpp" #include "KokkosGraph_Distance1Color.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" void print_options(std::ostream &os, const char *app_name, unsigned int indent = 0) { @@ -376,7 +377,7 @@ void run_multi_mem_experiment(Parameters params) { if (params.a_mem_space == 1) { fast_crstmat_t a_fast_crsmat; a_fast_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( + KokkosSparse::Impl::read_kokkos_crst_matrix( a_mat_file); a_fast_crsgraph = a_fast_crsmat.graph; num_cols = a_fast_crsmat.numCols(); @@ -384,7 +385,7 @@ void run_multi_mem_experiment(Parameters params) { } else { slow_crstmat_t a_slow_crsmat; a_slow_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( + KokkosSparse::Impl::read_kokkos_crst_matrix( a_mat_file); a_slow_crsgraph = a_slow_crsmat.graph; num_cols = a_slow_crsmat.numCols(); diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp index 7d6f45889a..b47fe21a70 100644 --- a/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/perf_test/graph/KokkosGraph_color_d2.cpp @@ -65,6 +65,7 @@ #include #include "KokkosKernels_default_types.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" using namespace KokkosGraph; @@ -595,7 +596,7 @@ void experiment_driver(const D2Parameters& params) { using graph_t = typename crsMat_t::StaticCrsGraphType; crsMat_t A = - KokkosKernels::Impl::read_kokkos_crst_matrix(params.mtx_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(params.mtx_file); graph_t Agraph = A.graph; int num_cols = A.numCols(); diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp index c68d5f85e2..dfe7715a1d 100644 --- a/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -66,6 +66,7 @@ #include "KokkosGraph_MIS2.hpp" #include "KokkosKernels_default_types.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" using namespace KokkosGraph; @@ -253,7 +254,7 @@ void run_mis2(const MIS2Parameters& params) { Kokkos::Timer t; crsMat_t A_in = - KokkosKernels::Impl::read_kokkos_crst_matrix(params.mtx_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(params.mtx_file); std::cout << "I/O time: " << t.seconds() << " s\n"; t.reset(); // Symmetrize the matrix just in case diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index 3d2be67676..2136cbb640 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -52,6 +52,7 @@ #include #include #include "KokkosKernels_default_types.hpp" +#include "KokkosSparse_IOUtils.hpp" #include #include #include @@ -177,7 +178,7 @@ crsMat_t generateLongRowMatrix(const GS_Parameters& params) { rowmap.data(), numRows + 1)); crsMat_t A("A", numRows, numRows, totalEntries, valuesView, rowmapView, entriesView); - A = KokkosKernels::sort_and_merge_matrix(A); + A = KokkosSparse::sort_and_merge_matrix(A); if (params.graph_symmetric) { // Symmetrize on host, rather than relying on the parallel versions (those // can be tested for symmetric=false) @@ -203,7 +204,7 @@ void runGS(const GS_Parameters& params) { typedef typename crsMat_t::values_type::non_const_type scalar_view_t; crsMat_t A; if (params.matrix_path) - A = KokkosKernels::Impl::read_kokkos_crst_matrix( + A = KokkosSparse::Impl::read_kokkos_crst_matrix( params.matrix_path); else A = generateLongRowMatrix(params); diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp index 953294b120..40887d67ec 100644 --- a/perf_test/sparse/KokkosSparse_kk_spmv.cpp +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -55,6 +55,7 @@ #include #include #include +#include #include #include "KokkosKernels_default_types.hpp" @@ -74,11 +75,11 @@ void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, srand(17312837); matrix_type A; if (filename) - A = KokkosKernels::Impl::read_kokkos_crst_matrix(filename); + A = KokkosSparse::Impl::read_kokkos_crst_matrix(filename); else { Offset nnz = 10 * numRows; // note: the help text says the bandwidth is fixed at 0.01 * numRows - A = KokkosKernels::Impl::kk_generate_sparse_matrix( + A = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, 0, 0.01 * numRows); } numRows = A.numRows(); diff --git a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp index 371f1b1d33..78520d64eb 100644 --- a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp +++ b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp @@ -44,6 +44,7 @@ #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_run_spgemm.hpp" +#include "KokkosSparse_IOUtils.hpp" namespace KokkosKernels { @@ -74,11 +75,11 @@ void run_multi_mem_spgemm(Parameters params) { if (params.a_mem_space == 1) { a_fast_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( + KokkosSparse::Impl::read_kokkos_crst_matrix( a_mat_file); } else { a_slow_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( + KokkosSparse::Impl::read_kokkos_crst_matrix( a_mat_file); } @@ -90,12 +91,12 @@ void run_multi_mem_spgemm(Parameters params) { } else if (params.b_mem_space == 1) { if (b_mat_file == NULL) b_mat_file = a_mat_file; b_fast_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( + KokkosSparse::Impl::read_kokkos_crst_matrix( b_mat_file); } else { if (b_mat_file == NULL) b_mat_file = a_mat_file; b_slow_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( + KokkosSparse::Impl::read_kokkos_crst_matrix( b_mat_file); } @@ -222,18 +223,18 @@ void run_multi_mem_spgemm(Parameters params) { if (c_mat_file != NULL) { if (params.c_mem_space == 1) { - KokkosKernels::sort_crs_matrix(c_fast_crsmat); + KokkosSparse::sort_crs_matrix(c_fast_crsmat); - KokkosKernels::Impl::write_graph_bin( + KokkosSparse::Impl::write_graph_bin( (lno_t)(c_fast_crsmat.numRows()), (size_type)(c_fast_crsmat.graph.entries.extent(0)), c_fast_crsmat.graph.row_map.data(), c_fast_crsmat.graph.entries.data(), c_fast_crsmat.values.data(), c_mat_file); } else { - KokkosKernels::sort_crs_matrix(c_slow_crsmat); + KokkosSparse::sort_crs_matrix(c_slow_crsmat); - KokkosKernels::Impl::write_graph_bin( + KokkosSparse::Impl::write_graph_bin( (lno_t)c_slow_crsmat.numRows(), (size_type)c_slow_crsmat.graph.entries.extent(0), c_slow_crsmat.graph.row_map.data(), diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 5f34ec1cd9..a98a8fcec8 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -49,6 +49,7 @@ #include "KokkosKernels_IOUtils.hpp" #include "KokkosKernels_default_types.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" #include #define MAXVAL 1 @@ -263,7 +264,7 @@ void run_pcg(int *cmdline, const char *mtx_file) { default_lno_t *xadj, *adj; default_scalar *ew; - KokkosKernels::Impl::read_matrix(&nv, &ne, &xadj, &adj, &ew, mtx_file); diff --git a/perf_test/sparse/KokkosSparse_run_spgemm.hpp b/perf_test/sparse/KokkosSparse_run_spgemm.hpp index caedb013c3..5ece07e403 100644 --- a/perf_test/sparse/KokkosSparse_run_spgemm.hpp +++ b/perf_test/sparse/KokkosSparse_run_spgemm.hpp @@ -44,7 +44,7 @@ #include "KokkosSparse_spgemm.hpp" #include "KokkosKernels_TestParameters.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_SortCrs.hpp" #define TRANPOSEFIRST false #define TRANPOSESECOND false @@ -67,7 +67,7 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2) { size_t nentries2 = output_mat2.graph.entries.extent(0); size_t nvals2 = output_mat2.values.extent(0); - KokkosKernels::sort_crs_matrix(output_mat1); + KokkosSparse::sort_crs_matrix(output_mat1); if (nrows1 != nrows2) { std::cerr << "row count is different" << std::endl; @@ -82,7 +82,7 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2) { return false; } - KokkosKernels::sort_crs_matrix(output_mat2); + KokkosSparse::sort_crs_matrix(output_mat2); bool is_identical = true; is_identical = KokkosKernels::Impl::kk_is_identical_view< diff --git a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp index b5ac32a86e..c48066316b 100644 --- a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp +++ b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp @@ -45,7 +45,8 @@ #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_TestParameters.hpp" #include "KokkosSparse_spgemm.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_SortCrs.hpp" +#include "KokkosSparse_IOUtils.hpp" #define TRANSPOSEFIRST false #define TRANSPOSESECOND false @@ -69,7 +70,7 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2) { size_t nentries2 = output_mat2.graph.entries.extent(0); size_t nvals2 = output_mat2.values.extent(0); - KokkosKernels::sort_crs_matrix(output_mat1); + KokkosSparse::sort_crs_matrix(output_mat1); if (nrows1 != nrows2) { std::cerr << "row count is different" << std::endl; @@ -84,7 +85,7 @@ bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2) { return false; } - KokkosKernels::sort_crs_matrix(output_mat2); + KokkosSparse::sort_crs_matrix(output_mat2); bool is_identical = true; is_identical = KokkosKernels::Impl::kk_is_identical_view< @@ -337,11 +338,11 @@ void run_spgemm_jacobi(Parameters params) { if (params.a_mem_space == 1) { a_fast_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( + KokkosSparse::Impl::read_kokkos_crst_matrix( a_mat_file); } else { a_slow_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( + KokkosSparse::Impl::read_kokkos_crst_matrix( a_mat_file); } @@ -353,12 +354,12 @@ void run_spgemm_jacobi(Parameters params) { } else if (params.b_mem_space == 1) { if (b_mat_file == NULL) b_mat_file = a_mat_file; b_fast_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( + KokkosSparse::Impl::read_kokkos_crst_matrix( b_mat_file); } else { if (b_mat_file == NULL) b_mat_file = a_mat_file; b_slow_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( + KokkosSparse::Impl::read_kokkos_crst_matrix( b_mat_file); } @@ -485,18 +486,18 @@ void run_spgemm_jacobi(Parameters params) { if (c_mat_file != NULL) { if (params.c_mem_space == 1) { - KokkosKernels::sort_crs_matrix(c_fast_crsmat); + KokkosSparse::sort_crs_matrix(c_fast_crsmat); - KokkosKernels::Impl::write_graph_bin( + KokkosSparse::Impl::write_graph_bin( (lno_t)(c_fast_crsmat.numRows()), (size_type)(c_fast_crsmat.graph.entries.extent(0)), c_fast_crsmat.graph.row_map.data(), c_fast_crsmat.graph.entries.data(), c_fast_crsmat.values.data(), c_mat_file); } else { - KokkosKernels::sort_crs_matrix(c_slow_crsmat); + KokkosSparse::sort_crs_matrix(c_slow_crsmat); - KokkosKernels::Impl::write_graph_bin( + KokkosSparse::Impl::write_graph_bin( (lno_t)c_slow_crsmat.numRows(), (size_type)c_slow_crsmat.graph.entries.extent(0), c_slow_crsmat.graph.row_map.data(), diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index de8b5fcca8..963ada8836 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -46,8 +46,8 @@ #include "KokkosKernels_config.h" #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils_cusparse.hpp" -#include "KokkosKernels_SparseUtils_mkl.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" +#include "KokkosSparse_Utils_mkl.hpp" #include "KokkosSparse_spadd.hpp" #include "KokkosKernels_TestUtils.hpp" diff --git a/perf_test/sparse/KokkosSparse_spiluk.cpp b/perf_test/sparse/KokkosSparse_spiluk.cpp index 2ee9573880..b86ecc352f 100644 --- a/perf_test/sparse/KokkosSparse_spiluk.cpp +++ b/perf_test/sparse/KokkosSparse_spiluk.cpp @@ -58,13 +58,14 @@ #include -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosSparse_spiluk.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosBlas1_nrm2.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_default_types.hpp" #include +#include #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) && \ (!defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION)) @@ -111,7 +112,7 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, if (!afilename.empty()) { std::cout << "ILU(K) Begin: Read matrix filename " << afilename << std::endl; - crsmat_t A = KokkosKernels::Impl::read_kokkos_crst_matrix( + crsmat_t A = KokkosSparse::Impl::read_kokkos_crst_matrix( afilename.c_str()); // in_matrix graph_t graph = A.graph; // in_graph const size_type nrows = graph.numRows(); diff --git a/perf_test/sparse/KokkosSparse_spmv.cpp b/perf_test/sparse/KokkosSparse_spmv.cpp index 6b67905adc..9eec6181a7 100644 --- a/perf_test/sparse/KokkosSparse_spmv.cpp +++ b/perf_test/sparse/KokkosSparse_spmv.cpp @@ -55,6 +55,7 @@ #include #include #include +#include #include #include "KokkosKernels_default_types.hpp" #include @@ -90,12 +91,12 @@ int test_crs_matrix_singlevec(Ordinal numRows, Ordinal numCols, int test, srand(17312837); matrix_type A; if (filename) - A = KokkosKernels::Impl::read_kokkos_crst_matrix(filename); + A = KokkosSparse::Impl::read_kokkos_crst_matrix(filename); else { Offset nnz = 10 * numRows; // note: the help text says the bandwidth is fixed at 0.01 * numRows // CAVEAT: small problem sizes are problematic, b/c of 0.01*numRows - A = KokkosKernels::Impl::kk_generate_sparse_matrix( + A = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, 0, 0.01 * numRows); } SPMVTestData test_data = setup_test(&data, A, rows_per_thread, team_size, diff --git a/perf_test/sparse/KokkosSparse_sptrsv.cpp b/perf_test/sparse/KokkosSparse_sptrsv.cpp index c6787242d9..a27ed3f6d2 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv.cpp @@ -58,12 +58,13 @@ #include -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosSparse_sptrsv.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_default_types.hpp" #include +#include "KokkosSparse_IOUtils.hpp" //#define INTERNAL_CUSPARSE @@ -159,7 +160,7 @@ int test_sptrsv_perf(std::vector tests, const std::string &lfilename, if (!lfilename.empty()) { std::cout << "Lower Tri Begin: Read matrix filename " << lfilename << std::endl; - crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix( + crsmat_t triMtx = KokkosSparse::Impl::read_kokkos_crst_matrix( lfilename.c_str()); // in_matrix graph_t graph = triMtx.graph; // in_graph const size_type nrows = graph.numRows(); @@ -567,7 +568,7 @@ int test_sptrsv_perf(std::vector tests, const std::string &lfilename, if (!ufilename.empty()) { std::cout << "Upper Tri Begin: Read matrix filename " << ufilename << std::endl; - crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix( + crsmat_t triMtx = KokkosSparse::Impl::read_kokkos_crst_matrix( ufilename.c_str()); // in_matrix graph_t graph = triMtx.graph; // in_graph const size_type nrows = graph.numRows(); diff --git a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp index 039c88e9c1..ad8e1ba8b9 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp @@ -43,9 +43,10 @@ */ #include "Kokkos_Random.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_sptrsv.hpp" #include "KokkosSparse_sptrsv_supernode.hpp" @@ -130,7 +131,7 @@ int test_sptrsv_perf(std::vector tests, bool verbose, std::cout << " > Read a triangular-matrix filename " << matrix_filename << std::endl; host_crsmat_t M = - KokkosKernels::Impl::read_kokkos_crst_matrix( + KokkosSparse::Impl::read_kokkos_crst_matrix( matrix_filename.c_str()); const size_type nrows = M.graph.numRows(); // transpose the matrix to be stored in CCS diff --git a/src/common/KokkosKernels_IOUtils.hpp b/src/common/KokkosKernels_IOUtils.hpp index d450221797..fe72d0cbf3 100644 --- a/src/common/KokkosKernels_IOUtils.hpp +++ b/src/common/KokkosKernels_IOUtils.hpp @@ -59,7 +59,6 @@ #include #include "Kokkos_Random.hpp" #include "KokkosKernels_SimpleUtils.hpp" -#include "KokkosSparse_CrsMatrix.hpp" #include namespace KokkosKernels { @@ -89,384 +88,6 @@ inline void getRandomBounds(double mag, Kokkos::complex &start, end = Kokkos::complex(mag, mag); } -// MD: Bases on Christian's sparseMatrix_generate function in test_crsmatrix.cpp -// file. -template -void kk_sparseMatrix_generate(OrdinalType nrows, OrdinalType ncols, - SizeType &nnz, OrdinalType row_size_variance, - OrdinalType bandwidth, ScalarType *&values, - SizeType *&rowPtr, OrdinalType *&colInd, - OrdinalType block_elem_count = 1) { - rowPtr = new SizeType[nrows + 1]; - - OrdinalType elements_per_row = nrows ? nnz / nrows : 0; - srand(13721); - rowPtr[0] = 0; - for (int row = 0; row < nrows; row++) { - int varianz = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; - int numRowEntries = elements_per_row + varianz; - if (numRowEntries < 0) numRowEntries = 0; - // Clamping numRowEntries above accomplishes 2 things: - // - If ncols is 0, numRowEntries will also be 0 - // - With numRowEntries at most 2/3 the number of columns, in the worst - // case - // 90% of insertions will succeed after 6 tries - if (numRowEntries > 0.66 * ncols) numRowEntries = 0.66 * ncols; - rowPtr[row + 1] = rowPtr[row] + numRowEntries; - } - nnz = rowPtr[nrows]; - values = new ScalarType[nnz]; - colInd = new OrdinalType[nnz]; - for (OrdinalType row = 0; row < nrows; row++) { - for (SizeType k = rowPtr[row]; k < rowPtr[row + 1]; ++k) { - while (true) { - OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row; - while (pos < 0) pos += ncols; - while (pos >= ncols) pos -= ncols; - - bool is_already_in_the_row = false; - for (SizeType j = rowPtr[row]; j < k; j++) { - if (colInd[j] == pos) { - is_already_in_the_row = true; - break; - } - } - if (!is_already_in_the_row) { - colInd[k] = pos; - break; - } - } - } - } - // Sample each value from uniform (-50, 50) for real types, or (-50 - 50i, 50 - // + 50i) for complex types. - Kokkos::View valuesView( - values, nnz * block_elem_count); - ScalarType randStart, randEnd; - getRandomBounds(50.0, randStart, randEnd); - Kokkos::Random_XorShift64_Pool pool(13718); - Kokkos::fill_random(valuesView, pool, randStart, randEnd); -} - -template -void kk_sparseMatrix_generate_lower_upper_triangle( - char uplo, OrdinalType nrows, OrdinalType ncols, SizeType &nnz, - OrdinalType /*row_size_variance*/, OrdinalType /*bandwidth*/, - ScalarType *&values, SizeType *&rowPtr, OrdinalType *&colInd) { - rowPtr = new SizeType[nrows + 1]; - - // OrdinalType elements_per_row = nnz/nrows; - srand(13721); - rowPtr[0] = 0; - for (int row = 0; row < nrows; row++) { - if (uplo == 'L') - rowPtr[row + 1] = rowPtr[row] + row + 1; - else - rowPtr[row + 1] = rowPtr[row] + ncols - (row); - } - nnz = rowPtr[nrows]; - values = new ScalarType[nnz]; - colInd = new OrdinalType[nnz]; - for (OrdinalType row = 0; row < nrows; row++) { - for (SizeType k = rowPtr[row]; k < rowPtr[row + 1]; k++) { - if (uplo == 'L') - colInd[k] = k - rowPtr[row]; - else - colInd[k] = row + (k - rowPtr[row]); - values[k] = 1.0; - } - } -} - -template -void kk_diagonally_dominant_sparseMatrix_generate( - OrdinalType nrows, OrdinalType ncols, SizeType &nnz, - OrdinalType row_size_variance, OrdinalType bandwidth, ScalarType *&values, - SizeType *&rowPtr, OrdinalType *&colInd, - ScalarType diagDominance = 10 * Kokkos::ArithTraits::one()) { - rowPtr = new SizeType[nrows + 1]; - - OrdinalType elements_per_row = nnz / nrows; - srand(13721); - rowPtr[0] = 0; - for (int row = 0; row < nrows; row++) { - int varianz = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; - if (varianz < 1) varianz = 1; - if (varianz > 0.75 * ncols) varianz = 0.75 * ncols; - rowPtr[row + 1] = rowPtr[row] + elements_per_row + varianz; - if (rowPtr[row + 1] <= rowPtr[row]) // This makes sure that there is - rowPtr[row + 1] = rowPtr[row] + 1; // at least one nonzero in the row - } - nnz = rowPtr[nrows]; - values = new ScalarType[nnz]; - colInd = new OrdinalType[nnz]; - for (OrdinalType row = 0; row < nrows; row++) { - ScalarType total_values = 0; - std::unordered_set entriesInRow; - // We always add the diagonal entry (after this loop) - entriesInRow.insert(row); - for (SizeType k = rowPtr[row]; k < rowPtr[row + 1] - 1; k++) { - while (true) { - OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row; - while (pos < 0) pos += ncols; - while (pos >= ncols) pos -= ncols; - - if (entriesInRow.find(pos) == entriesInRow.end()) { - entriesInRow.insert(pos); - colInd[k] = pos; - values[k] = 100.0 * rand() / RAND_MAX - 50.0; - total_values += - Kokkos::Details::ArithTraits::abs(values[k]); - break; - } - } - } - - colInd[rowPtr[row + 1] - 1] = row; - values[rowPtr[row + 1] - 1] = total_values * diagDominance; - } -} - -// This function creates a diagonal sparse matrix for testing matrix operations. -// The elements on the diagonal are 1, 2, ..., n-1, n. -// If "invert" is true, it will return the inverse of the above diagonal matrix. -template -crsMat_t kk_generate_diag_matrix(typename crsMat_t::const_ordinal_type n, - const bool invert = false) { - typedef typename crsMat_t::ordinal_type ot; - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - typedef typename row_map_view_t::non_const_value_type size_type; - typedef typename cols_view_t::non_const_value_type lno_t; - typedef typename values_view_t::non_const_value_type scalar_t; - - row_map_view_t rowmap_view("rowmap_view", n + 1); - cols_view_t columns_view("colsmap_view", n); - values_view_t values_view("values_view", n); - - { - typename row_map_view_t::HostMirror hr = - Kokkos::create_mirror_view(rowmap_view); - typename cols_view_t::HostMirror hc = - Kokkos::create_mirror_view(columns_view); - typename values_view_t::HostMirror hv = - Kokkos::create_mirror_view(values_view); - - for (lno_t i = 0; i <= n; ++i) { - hr(i) = size_type(i); - } - - for (ot i = 0; i < n; ++i) { - hc(i) = lno_t(i); - if (invert) { - hv(i) = scalar_t(1.0) / (scalar_t(i + 1)); - } else { - hv(i) = scalar_t(i + 1); - } - } - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - Kokkos::deep_copy(values_view, hv); - } - - graph_t static_graph(columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", n, values_view, static_graph); - return crsmat; -} - -template -crsMat_t kk_generate_diagonally_dominant_sparse_matrix( - typename crsMat_t::const_ordinal_type nrows, - typename crsMat_t::const_ordinal_type ncols, - typename crsMat_t::non_const_size_type &nnz, - typename crsMat_t::const_ordinal_type row_size_variance, - typename crsMat_t::const_ordinal_type bandwidth, - typename crsMat_t::const_value_type diagDominance = - 10 * Kokkos::ArithTraits::one()) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - typedef typename row_map_view_t::non_const_value_type size_type; - typedef typename cols_view_t::non_const_value_type lno_t; - typedef typename values_view_t::non_const_value_type scalar_t; - lno_t *adj; - size_type *xadj; //, nnzA; - scalar_t *values; - - kk_diagonally_dominant_sparseMatrix_generate( - nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj, - diagDominance); - - row_map_view_t rowmap_view("rowmap_view", nrows + 1); - cols_view_t columns_view("colsmap_view", nnz); - values_view_t values_view("values_view", nnz); - - { - typename row_map_view_t::HostMirror hr = - Kokkos::create_mirror_view(rowmap_view); - typename cols_view_t::HostMirror hc = - Kokkos::create_mirror_view(columns_view); - typename values_view_t::HostMirror hv = - Kokkos::create_mirror_view(values_view); - - for (lno_t i = 0; i <= nrows; ++i) { - hr(i) = xadj[i]; - } - - for (size_type i = 0; i < nnz; ++i) { - hc(i) = adj[i]; - hv(i) = values[i]; - } - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - Kokkos::deep_copy(values_view, hv); - } - - graph_t static_graph(columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph); - delete[] xadj; - delete[] adj; - delete[] values; - return crsmat; -} - -template -crsMat_t kk_generate_triangular_sparse_matrix( - char uplo, typename crsMat_t::const_ordinal_type nrows, - typename crsMat_t::const_ordinal_type ncols, - typename crsMat_t::non_const_size_type &nnz, - typename crsMat_t::const_ordinal_type row_size_variance, - typename crsMat_t::const_ordinal_type bandwidth) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - typedef typename row_map_view_t::non_const_value_type size_type; - typedef typename cols_view_t::non_const_value_type lno_t; - typedef typename values_view_t::non_const_value_type scalar_t; - lno_t *adj; - size_type *xadj; //, nnzA; - scalar_t *values; - - kk_sparseMatrix_generate_lower_upper_triangle( - uplo, nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj); - - row_map_view_t rowmap_view("rowmap_view", nrows + 1); - cols_view_t columns_view("colsmap_view", nnz); - values_view_t values_view("values_view", nnz); - - { - typename row_map_view_t::HostMirror hr = - Kokkos::create_mirror_view(rowmap_view); - typename cols_view_t::HostMirror hc = - Kokkos::create_mirror_view(columns_view); - typename values_view_t::HostMirror hv = - Kokkos::create_mirror_view(values_view); - - for (lno_t i = 0; i <= nrows; ++i) { - hr(i) = xadj[i]; - } - - for (size_type i = 0; i < nnz; ++i) { - hc(i) = adj[i]; - hv(i) = values[i]; - } - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - Kokkos::deep_copy(values_view, hv); - Kokkos::fence(); - } - - graph_t static_graph(columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph); - delete[] xadj; - delete[] adj; - delete[] values; - return crsmat; -} - -template -crsMat_t kk_generate_sparse_matrix( - typename crsMat_t::const_ordinal_type nrows, - typename crsMat_t::const_ordinal_type ncols, - typename crsMat_t::non_const_size_type &nnz, - typename crsMat_t::const_ordinal_type row_size_variance, - typename crsMat_t::const_ordinal_type bandwidth) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - typedef typename row_map_view_t::non_const_value_type size_type; - typedef typename cols_view_t::non_const_value_type lno_t; - typedef typename values_view_t::non_const_value_type scalar_t; - lno_t *adj; - size_type *xadj; //, nnzA; - scalar_t *values; - - kk_sparseMatrix_generate( - nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj); - - row_map_view_t rowmap_view("rowmap_view", nrows + 1); - cols_view_t columns_view("colsmap_view", nnz); - values_view_t values_view("values_view", nnz); - - { - typename row_map_view_t::HostMirror hr = - Kokkos::create_mirror_view(rowmap_view); - typename cols_view_t::HostMirror hc = - Kokkos::create_mirror_view(columns_view); - typename values_view_t::HostMirror hv = - Kokkos::create_mirror_view(values_view); - - for (lno_t i = 0; i <= nrows; ++i) { - hr(i) = xadj[i]; - } - - for (size_type i = 0; i < nnz; ++i) { - hc(i) = adj[i]; - hv(i) = values[i]; - } - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - Kokkos::deep_copy(values_view, hv); - } - - graph_t static_graph(columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph); - delete[] xadj; - delete[] adj; - delete[] values; - return crsmat; -} - -template -bsrMat_t kk_generate_sparse_matrix( - typename bsrMat_t::const_ordinal_type block_dim, - typename bsrMat_t::const_ordinal_type nrows, - typename bsrMat_t::const_ordinal_type ncols, - typename bsrMat_t::non_const_size_type &nnz, - typename bsrMat_t::const_ordinal_type row_size_variance, - typename bsrMat_t::const_ordinal_type bandwidth) { - typedef KokkosSparse::CrsMatrix< - typename bsrMat_t::value_type, typename bsrMat_t::ordinal_type, - typename bsrMat_t::device_type, typename bsrMat_t::memory_traits, - typename bsrMat_t::size_type> - crsMat_t; - - const auto crs_mtx = kk_generate_sparse_matrix( - nrows * block_dim, ncols * block_dim, nnz, row_size_variance, bandwidth); - bsrMat_t bsrmat(crs_mtx, block_dim); - return bsrmat; -} -// TODO: need to fix the size_type. All over the reading inputs are lno_t. - template void md_malloc(stype **arr, size_t n, std::string /*alloc_str*/ = "") { *arr = new stype[n]; @@ -647,130 +268,6 @@ inline void kk_read_3Dview_from_file(idx_array_type &view, Kokkos::fence(); } -template -void convert_crs_to_lower_triangle_edge_list(idx nv, idx *xadj, idx *adj, - idx *lower_triangle_srcs, - idx *lower_triangle_dests) { - idx ind = 0; - for (idx i = 0; i < nv; ++i) { - idx xb = xadj[i]; - idx xe = xadj[i + 1]; - for (idx j = xb; j < xe; ++j) { - idx dst = adj[j]; - if (i < dst) { - lower_triangle_srcs[ind] = i; - lower_triangle_dests[ind++] = dst; - } - } - } -} - -template -void convert_crs_to_edge_list(idx nv, idx *xadj, idx *srcs) { - for (idx i = 0; i < nv; ++i) { - idx xb = xadj[i]; - idx xe = xadj[i + 1]; - for (idx j = xb; j < xe; ++j) { - srcs[j] = i; - } - } -} - -template -void convert_edge_list_to_csr(lno_t nv, size_type ne, lno_t *srcs, lno_t *dests, - wt *ew, size_type *xadj, lno_t *adj, wt *crs_ew) { - std::vector> edges(ne); - for (size_type i = 0; i < ne; ++i) { - edges[i].src = srcs[i]; - edges[i].dst = dests[i]; - edges[i].ew = ew[i]; - } - std::sort(edges.begin(), edges.begin() + ne); - - size_type eind = 0; - for (lno_t i = 0; i < nv; ++i) { - (xadj)[i] = eind; - while (edges[eind].src == i) { - (adj)[eind] = edges[eind].dst; - (*crs_ew)[eind] = edges[eind].ew; - ++eind; - } - } - xadj[nv] = eind; -} - -template -void convert_undirected_edge_list_to_csr(lno_t nv, size_type ne, in_lno_t *srcs, - in_lno_t *dests, size_type *xadj, - lno_t *adj) { - std::vector> edges(ne * 2); - for (size_type i = 0; i < ne; ++i) { - edges[i * 2].src = srcs[i]; - edges[i * 2].dst = dests[i]; - - edges[i * 2 + 1].src = dests[i]; - edges[i * 2 + 1].dst = srcs[i]; - } -#ifdef KOKKOSKERNELS_HAVE_OUTER -#include -#include -#include -#include - __gnu_parallel::parallel_sort_mwms *>( - &(edges[0]), &(edges[0]) + ne * 2, - std::less>(), 64); -#else - std::sort(edges.begin(), edges.begin() + ne * 2); -#endif - - size_type eind = 0; - for (lno_t i = 0; i < nv; ++i) { - (xadj)[i] = eind; - while (edges[eind].src == i) { - (adj)[eind] = edges[eind].dst; - //(*crs_ew)[eind] = edges[eind].ew; - ++eind; - } - } - xadj[nv] = eind; -} -/* - -template -void read_graph_src_dst_bin( - lno_t *nv, size_type *ne - ,size_type **xadj, lno_t **adj, scalar_t **ew, - const char *fnameSrc, const char *fnameTarg){ - - size_t numEdges = 0; - size_t *srcs, *dst; //this type is hard coded - buildEdgeListFromBinSrcTarg_undirected( - fnameSrc, fnameTarg, - &numEdges, - &srcs, &dst); - - lno_t num_vertex = 0; - for (size_t i = 0; i < numEdges; ++i){ - if (num_vertex < srcs[i]) num_vertex = srcs[i]; - if (num_vertex < dst[i]) num_vertex = dst[i]; - } - num_vertex += 1; - - *nv = num_vertex; - *ne = numEdges * 2; - - md_malloc(xadj, num_vertex + 1); - md_malloc(adj, numEdges * 2); - convert_undirected_edge_list_to_csr ( - num_vertex, numEdges, - srcs, dst, - *xadj, *adj); - - delete [] srcs; - delete [] dst; -} -*/ - template void write_edgelist_bin(size_t ne, const idx *edge_begins, const idx *edge_ends, const wt *ew, const char *filename) { @@ -797,270 +294,6 @@ void read_edgelist_bin(idx *ne, idx **edge_begins, idx **edge_ends, wt **ew, myFile.close(); } -template -void write_graph_bin(lno_t nv, size_type ne, const size_type *xadj, - const lno_t *adj, const scalar_t *ew, - const char *filename) { - std::ofstream myFile(filename, std::ios::out | std::ios::binary); - myFile.write((char *)&nv, sizeof(lno_t)); - myFile.write((char *)&ne, sizeof(size_type)); - myFile.write((char *)xadj, sizeof(size_type) * (nv + 1)); - - myFile.write((char *)adj, sizeof(lno_t) * (ne)); - - myFile.write((char *)ew, sizeof(scalar_t) * (ne)); - - myFile.close(); -} - -template -void write_graph_crs(lno_t nv, size_type ne, const size_type *xadj, - const lno_t *adj, const scalar_t *ew, - const char *filename) { - std::ofstream myFile(filename, std::ios::out); - myFile << nv << " " << ne << std::endl; - - for (lno_t i = 0; i <= nv; ++i) { - myFile << xadj[i] << " "; - } - myFile << std::endl; - - for (lno_t i = 0; i < nv; ++i) { - size_type b = xadj[i]; - size_type e = xadj[i + 1]; - for (size_type j = b; j < e; ++j) { - myFile << adj[j] << " "; - } - myFile << std::endl; - } - for (size_type i = 0; i < ne; ++i) { - myFile << ew[i] << " "; - } - myFile << std::endl; - - myFile.close(); -} - -template -void write_graph_ligra(lno_t nv, size_type ne, const size_type *xadj, - const lno_t *adj, const scalar_t * /*ew*/, - const char *filename) { - std::ofstream ff(filename); - ff << "AdjacencyGraph" << std::endl; - ff << nv << std::endl << ne << std::endl; - for (lno_t i = 0; i < nv; ++i) { - ff << xadj[i] << std::endl; - } - for (size_type i = 0; i < ne; ++i) { - ff << adj[i] << std::endl; - } - ff.close(); -} - -// MM: types and utility functions for parsing the MatrixMarket format -namespace MM { -enum MtxObject { UNDEFINED_OBJECT, MATRIX, VECTOR }; -enum MtxFormat { UNDEFINED_FORMAT, COORDINATE, ARRAY }; -enum MtxField { - UNDEFINED_FIELD, - REAL, // includes both float and double - COMPLEX, // includes complex and complex - INTEGER, // includes all integer types - PATTERN // not a type, but means the value for every entry is 1 -}; -enum MtxSym { - UNDEFINED_SYMMETRY, - GENERAL, - SYMMETRIC, // A(i, j) = A(j, i) - SKEW_SYMMETRIC, // A(i, j) = -A(j, i) - HERMITIAN // A(i, j) = a + bi; A(j, i) = a - bi -}; - -// readScalar/writeScalar: read and write a scalar in the form that it appears -// in an .mtx file. The >> and << operators won't work, because complex appears -// as "real imag", not "(real, imag)" -template -scalar_t readScalar(std::istream &is) { - scalar_t val; - is >> val; - return val; -} - -template <> -inline Kokkos::complex readScalar(std::istream &is) { - float r, i; - is >> r; - is >> i; - return Kokkos::complex(r, i); -} - -template <> -inline Kokkos::complex readScalar(std::istream &is) { - double r, i; - is >> r; - is >> i; - return Kokkos::complex(r, i); -} - -template -void writeScalar(std::ostream &os, scalar_t val) { - os << val; -} - -template <> -inline void writeScalar(std::ostream &os, Kokkos::complex val) { - os << val.real() << ' ' << val.imag(); -} - -template <> -inline void writeScalar(std::ostream &os, Kokkos::complex val) { - os << val.real() << ' ' << val.imag(); -} - -// symmetryFlip: given a value for A(i, j), return the value that -// should be inserted at A(j, i) (if any) -template -scalar_t symmetryFlip(scalar_t val, MtxSym symFlag) { - if (symFlag == SKEW_SYMMETRIC) return -val; - return val; -} - -template <> -inline Kokkos::complex symmetryFlip(Kokkos::complex val, - MtxSym symFlag) { - if (symFlag == HERMITIAN) - return Kokkos::conj(val); - else if (symFlag == SKEW_SYMMETRIC) - return -val; - return val; -} - -template <> -inline Kokkos::complex symmetryFlip(Kokkos::complex val, - MtxSym symFlag) { - if (symFlag == HERMITIAN) - return Kokkos::conj(val); - else if (symFlag == SKEW_SYMMETRIC) - return -val; - return val; -} -} // namespace MM - -template -void write_matrix_mtx(lno_t nrows, lno_t ncols, size_type nentries, - const size_type *xadj, const lno_t *adj, - const scalar_t *vals, const char *filename) { - std::ofstream myFile(filename); - myFile << "%%MatrixMarket matrix coordinate "; - if (std::is_same>::value || - std::is_same>::value) - myFile << "complex"; - else - myFile << "real"; - myFile << " general\n"; - myFile << nrows << " " << ncols << " " << nentries << '\n'; - myFile << std::setprecision(17) << std::scientific; - for (lno_t i = 0; i < nrows; ++i) { - size_type b = xadj[i]; - size_type e = xadj[i + 1]; - for (size_type j = b; j < e; ++j) { - myFile << i + 1 << " " << adj[j] + 1 << " "; - MM::writeScalar(myFile, vals[j]); - myFile << '\n'; - } - } - myFile.close(); -} - -template -void write_graph_mtx(lno_t nv, size_type ne, const size_type *xadj, - const lno_t *adj, const scalar_t *ew, - const char *filename) { - std::ofstream myFile(filename); - myFile << "%%MatrixMarket matrix coordinate "; - if (std::is_same>::value || - std::is_same>::value) - myFile << "complex"; - else - myFile << "real"; - myFile << " general\n"; - myFile << nv << " " << nv << " " << ne << '\n'; - myFile << std::setprecision(8) << std::scientific; - for (lno_t i = 0; i < nv; ++i) { - size_type b = xadj[i]; - size_type e = xadj[i + 1]; - for (size_type j = b; j < e; ++j) { - myFile << i + 1 << " " << (adj)[j] + 1 << " "; - MM::writeScalar(myFile, ew[j]); - myFile << '\n'; - } - } - - myFile.close(); -} - -template -void read_graph_bin(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, - scalar_t **ew, const char *filename) { - std::ifstream myFile(filename, std::ios::in | std::ios::binary); - - myFile.read((char *)nv, sizeof(lno_t)); - myFile.read((char *)ne, sizeof(size_type)); - md_malloc(xadj, *nv + 1); - md_malloc(adj, *ne); - md_malloc(ew, *ne); - myFile.read((char *)*xadj, sizeof(size_type) * (*nv + 1)); - myFile.read((char *)*adj, sizeof(lno_t) * (*ne)); - myFile.read((char *)*ew, sizeof(scalar_t) * (*ne)); - myFile.close(); -} - -// When Kokkos issue #2313 is resolved, can delete -// parseScalar and just use operator>> -template -scalar_t parseScalar(std::istream &is) { - scalar_t val; - is >> val; - return val; -} - -template <> -inline Kokkos::complex parseScalar(std::istream &is) { - std::complex val; - is >> val; - return Kokkos::complex(val); -} - -template <> -inline Kokkos::complex parseScalar(std::istream &is) { - std::complex val; - is >> val; - return Kokkos::complex(val); -} - -template -void read_graph_crs(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, - scalar_t **ew, const char *filename) { - std::ifstream myFile(filename, std::ios::in); - myFile >> *nv >> *ne; - - md_malloc(xadj, *nv + 1); - md_malloc(adj, *ne); - md_malloc(ew, *ne); - - for (lno_t i = 0; i <= *nv; ++i) { - myFile >> (*xadj)[i]; - } - - for (size_type i = 0; i < *ne; ++i) { - myFile >> (*adj)[i]; - } - for (size_type i = 0; i < *ne; ++i) { - (*ew)[i] = parseScalar(myFile); - } - myFile.close(); -} - inline bool endswith(std::string const &fullString, std::string const &ending) { if (fullString.length() >= ending.length()) { return (0 == fullString.compare(fullString.length() - ending.length(), @@ -1070,491 +303,6 @@ inline bool endswith(std::string const &fullString, std::string const &ending) { } } -template -void write_kokkos_crst_matrix(crs_matrix_t a_crsmat, const char *filename) { - typedef typename crs_matrix_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crs_matrix_t::values_type::non_const_type values_view_t; - - typedef typename row_map_view_t::value_type offset_t; - typedef typename cols_view_t::value_type lno_t; - typedef typename values_view_t::value_type scalar_t; - typedef typename values_view_t::size_type size_type; - - size_type nnz = a_crsmat.nnz(); - - auto a_rowmap_view = Kokkos::create_mirror_view_and_copy( - Kokkos::HostSpace(), a_crsmat.graph.row_map); - auto a_entries_view = Kokkos::create_mirror_view_and_copy( - Kokkos::HostSpace(), a_crsmat.graph.entries); - auto a_values_view = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_crsmat.values); - offset_t *a_rowmap = const_cast(a_rowmap_view.data()); - lno_t *a_entries = a_entries_view.data(); - scalar_t *a_values = a_values_view.data(); - - std::string strfilename(filename); - if (endswith(strfilename, ".mtx") || endswith(strfilename, ".mm")) { - write_matrix_mtx( - a_crsmat.numRows(), a_crsmat.numCols(), a_crsmat.nnz(), a_rowmap, - a_entries, a_values, filename); - return; - } else if (a_crsmat.numRows() != a_crsmat.numCols()) { - throw std::runtime_error( - "For formats other than MatrixMarket (suffix .mm or .mtx),\n" - "write_kokkos_crst_matrix only supports square matrices"); - } - if (endswith(strfilename, ".bin")) { - write_graph_bin( - a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename); - } else if (endswith(strfilename, ".ligra")) { - write_graph_ligra( - a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename); - } else if (endswith(strfilename, ".crs")) { - write_graph_crs( - a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename); - } else { - std::string errMsg = - std::string("write_kokkos_crst_matrix: File extension on ") + filename + - " does not correspond to a known format"; - throw std::runtime_error(errMsg); - } -} - -template -int read_mtx(const char *fileName, lno_t *nrows, lno_t *ncols, size_type *ne, - size_type **xadj, lno_t **adj, scalar_t **ew, - bool symmetrize = false, bool remove_diagonal = true, - bool transpose = false) { - using namespace MM; - std::ifstream mmf(fileName, std::ifstream::in); - if (!mmf.is_open()) { - throw std::runtime_error("File cannot be opened\n"); - } - - std::string fline = ""; - getline(mmf, fline); - - if (fline.size() < 2 || fline[0] != '%' || fline[1] != '%') { - throw std::runtime_error("Invalid MM file. Line-1\n"); - } - - // make sure every required field is in the file, by initializing them to - // UNDEFINED_* - MtxObject mtx_object = UNDEFINED_OBJECT; - MtxFormat mtx_format = UNDEFINED_FORMAT; - MtxField mtx_field = UNDEFINED_FIELD; - MtxSym mtx_sym = UNDEFINED_SYMMETRY; - - if (fline.find("matrix") != std::string::npos) { - mtx_object = MATRIX; - } else if (fline.find("vector") != std::string::npos) { - mtx_object = VECTOR; - throw std::runtime_error( - "MatrixMarket \"vector\" is not supported by KokkosKernels read_mtx()"); - } - - if (fline.find("coordinate") != std::string::npos) { - // sparse - mtx_format = COORDINATE; - } else if (fline.find("array") != std::string::npos) { - // dense - mtx_format = ARRAY; - } - - if (fline.find("real") != std::string::npos || - fline.find("double") != std::string::npos) { - if (std::is_same::value || - std::is_same::value) - mtx_field = REAL; - else { - if (!std::is_floating_point::value) - throw std::runtime_error( - "scalar_t in read_mtx() incompatible with float or double typed " - "MatrixMarket file."); - else - mtx_field = REAL; - } - } else if (fline.find("complex") != std::string::npos) { - if (!(std::is_same>::value || - std::is_same>::value)) - throw std::runtime_error( - "scalar_t in read_mtx() incompatible with complex-typed MatrixMarket " - "file."); - else - mtx_field = COMPLEX; - } else if (fline.find("integer") != std::string::npos) { - if (std::is_integral::value || - std::is_floating_point::value || - std::is_same::value || - std::is_same::value) - mtx_field = INTEGER; - else - throw std::runtime_error( - "scalar_t in read_mtx() incompatible with integer-typed MatrixMarket " - "file."); - } else if (fline.find("pattern") != std::string::npos) { - mtx_field = PATTERN; - // any reasonable choice for scalar_t can represent "1" or "1.0 + 0i", so - // nothing to check here - } - - if (fline.find("general") != std::string::npos) { - mtx_sym = GENERAL; - } else if (fline.find("skew-symmetric") != std::string::npos) { - mtx_sym = SKEW_SYMMETRIC; - } else if (fline.find("symmetric") != std::string::npos) { - // checking for "symmetric" after "skew-symmetric" because it's a substring - mtx_sym = SYMMETRIC; - } else if (fline.find("hermitian") != std::string::npos || - fline.find("Hermitian") != std::string::npos) { - mtx_sym = HERMITIAN; - } - // Validate the matrix attributes - if (mtx_format == ARRAY) { - if (mtx_sym == UNDEFINED_SYMMETRY) mtx_sym = GENERAL; - if (mtx_sym != GENERAL) - throw std::runtime_error( - "array format MatrixMarket file must have general symmetry (optional " - "to include \"general\")"); - } - if (mtx_object == UNDEFINED_OBJECT) - throw std::runtime_error( - "MatrixMarket file header is missing the object type."); - if (mtx_format == UNDEFINED_FORMAT) - throw std::runtime_error("MatrixMarket file header is missing the format."); - if (mtx_field == UNDEFINED_FIELD) - throw std::runtime_error( - "MatrixMarket file header is missing the field type."); - if (mtx_sym == UNDEFINED_SYMMETRY) - throw std::runtime_error( - "MatrixMarket file header is missing the symmetry type."); - - while (1) { - getline(mmf, fline); - if (fline[0] != '%') break; - } - std::stringstream ss(fline); - lno_t nr = 0, nc = 0; - size_type nnz = 0; - ss >> nr >> nc; - if (mtx_format == COORDINATE) - ss >> nnz; - else - nnz = nr * nc; - size_type numEdges = nnz; - symmetrize = symmetrize || mtx_sym != GENERAL; - if (symmetrize && nr != nc) { - throw std::runtime_error("A non-square matrix cannot be symmetrized."); - } - if (mtx_format == ARRAY) { - // Array format only supports general symmetry and non-pattern - if (symmetrize) - throw std::runtime_error( - "array format MatrixMarket file cannot be symmetrized."); - if (mtx_field == PATTERN) - throw std::runtime_error( - "array format MatrixMarket file can't have \"pattern\" field type."); - } - if (symmetrize) { - numEdges = 2 * nnz; - } - // numEdges is only an upper bound (diagonal entries may be removed) - std::vector> edges(numEdges); - size_type nE = 0; - lno_t numDiagonal = 0; - for (size_type i = 0; i < nnz; ++i) { - getline(mmf, fline); - std::stringstream ss2(fline); - struct Edge tmp; - // read source, dest (edge) and weight (value) - lno_t s, d; - scalar_t w; - if (mtx_format == ARRAY) { - // In array format, entries are listed in column major order, - // so the row and column can be determined just from the index i - //(but make them 1-based indices, to match the way coordinate works) - s = i % nr + 1; // row - d = i / nr + 1; // col - } else { - // In coordinate format, row and col of each entry is read from file - ss2 >> s >> d; - } - if (mtx_field == PATTERN) - w = 1; - else - w = readScalar(ss2); - if (!transpose) { - tmp.src = s - 1; - tmp.dst = d - 1; - tmp.ew = w; - } else { - tmp.src = d - 1; - tmp.dst = s - 1; - tmp.ew = w; - } - if (tmp.src == tmp.dst) { - numDiagonal++; - if (!remove_diagonal) { - edges[nE++] = tmp; - } - continue; - } - edges[nE++] = tmp; - if (symmetrize) { - struct Edge tmp2; - tmp2.src = tmp.dst; - tmp2.dst = tmp.src; - // the symmetrized value is w, -w or conj(w) if mtx_sym is - // SYMMETRIC, SKEW_SYMMETRIC or HERMITIAN, respectively. - tmp2.ew = symmetryFlip(tmp.ew, mtx_sym); - edges[nE++] = tmp2; - } - } - mmf.close(); - std::sort(edges.begin(), edges.begin() + nE); - if (transpose) { - lno_t tmp = nr; - nr = nc; - nc = tmp; - } - // idx *nv, idx *ne, idx **xadj, idx **adj, wt **wt - *nrows = nr; - *ncols = nc; - *ne = nE; - //*xadj = new idx[nr + 1]; - md_malloc(xadj, nr + 1); - //*adj = new idx[nE]; - md_malloc(adj, nE); - //*ew = new wt[nE]; - md_malloc(ew, nE); - size_type eind = 0; - size_type actual = 0; - for (lno_t i = 0; i < nr; ++i) { - (*xadj)[i] = actual; - bool is_first = true; - while (eind < nE && edges[eind].src == i) { - if (is_first || !symmetrize || eind == 0 || - (eind > 0 && edges[eind - 1].dst != edges[eind].dst)) { - (*adj)[actual] = edges[eind].dst; - (*ew)[actual] = edges[eind].ew; - ++actual; - } - is_first = false; - ++eind; - } - } - (*xadj)[nr] = actual; - *ne = actual; - return 0; -} - -// Version of read_mtx which does not capture the number of columns. -// This is the old interface; it's kept for backwards compatibility. -template -int read_mtx(const char *fileName, lno_t *nv, size_type *ne, size_type **xadj, - lno_t **adj, scalar_t **ew, bool symmetrize = false, - bool remove_diagonal = true, bool transpose = false) { - lno_t ncol; // will discard - return read_mtx(fileName, nv, &ncol, ne, xadj, - adj, ew, symmetrize, - remove_diagonal, transpose); -} - -template -void read_matrix(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, - scalar_t **ew, const char *filename) { - std::string strfilename(filename); - if (endswith(strfilename, ".mtx") || endswith(strfilename, ".mm")) { - read_mtx(filename, nv, ne, xadj, adj, ew, false, false, false); - } - - else if (endswith(strfilename, ".bin")) { - read_graph_bin(nv, ne, xadj, adj, ew, filename); - } - - else if (endswith(strfilename, ".crs")) { - read_graph_crs(nv, ne, xadj, adj, ew, filename); - } - - else { - throw std::runtime_error("Reader is not available\n"); - } -} - -template -crsMat_t read_kokkos_crst_matrix(const char *filename_) { - std::string strfilename(filename_); - bool isMatrixMarket = - endswith(strfilename, ".mtx") || endswith(strfilename, ".mm"); - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - typedef typename row_map_view_t::value_type size_type; - typedef typename cols_view_t::value_type lno_t; - typedef typename values_view_t::value_type scalar_t; - - lno_t nr, nc, *adj; - size_type *xadj, nnzA; - scalar_t *values; - - if (isMatrixMarket) { - // MatrixMarket file contains the exact number of columns - read_mtx(filename_, &nr, &nc, &nnzA, &xadj, - &adj, &values, false, false, false); - } else { - //.crs and .bin files don't contain #cols, so will compute it later based on - // the entries - read_matrix(&nr, &nnzA, &xadj, &adj, &values, - filename_); - } - - row_map_view_t rowmap_view("rowmap_view", nr + 1); - cols_view_t columns_view("colsmap_view", nnzA); - values_view_t values_view("values_view", nnzA); - - { - Kokkos::View> - hr(xadj, nr + 1); - Kokkos::View> - hc(adj, nnzA); - Kokkos::View> - hv(values, nnzA); - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - Kokkos::deep_copy(values_view, hv); - } - - if (!isMatrixMarket) { - KokkosKernels::Impl::kk_view_reduce_max( - nnzA, columns_view, nc); - nc++; - } - - graph_t static_graph(columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nc, values_view, static_graph); - delete[] xadj; - delete[] adj; - delete[] values; - return crsmat; -} - -template -crsGraph_t read_kokkos_crst_graph(const char *filename_) { - typedef typename crsGraph_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsGraph_t::entries_type::non_const_type cols_view_t; - - typedef typename row_map_view_t::value_type size_type; - typedef typename cols_view_t::value_type lno_t; - typedef double scalar_t; - - lno_t nv, *adj; - size_type *xadj, nnzA; - scalar_t *values; - read_matrix(&nv, &nnzA, &xadj, &adj, &values, - filename_); - - row_map_view_t rowmap_view("rowmap_view", nv + 1); - cols_view_t columns_view("colsmap_view", nnzA); - - { - typename row_map_view_t::HostMirror hr = - Kokkos::create_mirror_view(rowmap_view); - typename cols_view_t::HostMirror hc = - Kokkos::create_mirror_view(columns_view); - - for (lno_t i = 0; i <= nv; ++i) { - hr(i) = xadj[i]; - } - - for (size_type i = 0; i < nnzA; ++i) { - hc(i) = adj[i]; - } - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - } - - lno_t ncols = 0; - KokkosKernels::Impl::kk_view_reduce_max( - nnzA, columns_view, ncols); - ncols += 1; - - crsGraph_t static_graph(columns_view, rowmap_view, ncols); - delete[] xadj; - delete[] adj; - delete[] values; - return static_graph; -} - -template -inline void kk_sequential_create_incidence_matrix( - nnz_lno_t num_rows, const size_type *xadj, const nnz_lno_t *adj, - size_type *i_adj // output. preallocated -) { - std::vector c_xadj(num_rows); - for (nnz_lno_t i = 0; i < num_rows; i++) { - c_xadj[i] = xadj[i]; - } - int eCnt = 0; - for (nnz_lno_t i = 0; i < num_rows; i++) { - size_type begin = xadj[i]; - size_type end = xadj[i + 1]; - nnz_lno_t adjsize = end - begin; - - for (nnz_lno_t j = 0; j < adjsize; j++) { - size_type aind = j + begin; - nnz_lno_t col = adj[aind]; - if (i < col) { - i_adj[c_xadj[i]++] = eCnt; - i_adj[c_xadj[col]++] = eCnt++; - } - } - } - - for (nnz_lno_t i = 0; i < num_rows; i++) { - if (c_xadj[i] != xadj[i + 1]) { - std::cout << "i:" << i << " c_xadj[i]:" << c_xadj[i] - << " xadj[i+1]:" << xadj[i + 1] << std::endl; - } - } -} - -template -inline void kk_sequential_create_incidence_matrix_transpose( - const nnz_lno_t num_rows, const size_type num_edges, const size_type *xadj, - const nnz_lno_t *adj, - size_type *i_xadj, // output. preallocated - nnz_lno_t *i_adj // output. preallocated -) { - for (nnz_lno_t i = 0; i < num_edges / 2 + 1; i++) { - i_xadj[i] = i * 2; - } - int eCnt = 0; - for (nnz_lno_t i = 0; i < num_rows; i++) { - size_type begin = xadj[i]; - size_type end = xadj[i + 1]; - nnz_lno_t adjsize = end - begin; - - for (nnz_lno_t j = 0; j < adjsize; j++) { - size_type aind = j + begin; - nnz_lno_t col = adj[aind]; - if (i < col) { - i_adj[eCnt++] = i; - i_adj[eCnt++] = col; - } - } - } -} - } // namespace Impl } // namespace KokkosKernels diff --git a/src/common/KokkosKernels_Sorting.hpp b/src/common/KokkosKernels_Sorting.hpp index 208688ae5b..8b897047d9 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -61,48 +61,6 @@ struct DefaultComparator { }; } // namespace Impl -// ---------------------------------- -// BSR matrix/graph sorting utilities -// ---------------------------------- - -template -void sort_bsr_matrix(const bsrMat_t& A); - -// ---------------------------------- -// CRS matrix/graph sorting utilities -// ---------------------------------- - -// The sort_crs* functions sort the adjacent column list for each row into -// ascending order. - -template -void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, - const values_t& values); - -template -void sort_crs_matrix(const crsMat_t& A); - -template -void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries); - -template -void sort_crs_graph(const crsGraph_t& G); - -// sort_and_merge_matrix produces a new matrix which is equivalent to A but is -// sorted and has no duplicated entries: each (i, j) is unique. Values for -// duplicated entries are summed. -template -crsMat_t sort_and_merge_matrix(const crsMat_t& A); - -template -crsGraph_t sort_and_merge_graph(const crsGraph_t& G); - -template -void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, - const entries_t& entries_in, rowmap_t& rowmap_out, - entries_t& entries_out); - // ---------------------------- // General device-level sorting // ---------------------------- @@ -155,240 +113,6 @@ KOKKOS_INLINE_FUNCTION void TeamBitonicSort2( namespace Impl { -template -struct SortCrsMatrixFunctor { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using scalar_t = typename values_t::non_const_value_type; - using team_mem = typename Kokkos::TeamPolicy::member_type; - // The functor owns memory for entriesAux, so it can't have - // MemoryTraits - using entries_managed_t = Kokkos::View; - using values_managed_t = Kokkos::View; - - SortCrsMatrixFunctor(bool usingRangePol, const rowmap_t& rowmap_, - const entries_t& entries_, const values_t& values_) - : rowmap(rowmap_), entries(entries_), values(values_) { - if (usingRangePol) { - entriesAux = entries_managed_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), - entries.extent(0)); - valuesAux = values_managed_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values aux"), - values.extent(0)); - } - // otherwise, aux arrays won't be allocated (sorting in place) - } - - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - // Radix sort requires unsigned keys for comparison - using unsigned_lno_t = typename std::make_unsigned::type; - KokkosKernels::SerialRadixSort2( - (unsigned_lno_t*)entries.data() + rowStart, - (unsigned_lno_t*)entriesAux.data() + rowStart, values.data() + rowStart, - valuesAux.data() + rowStart, rowNum); - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const { - size_type i = t.league_rank(); - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - KokkosKernels::TeamBitonicSort2( - entries.data() + rowStart, values.data() + rowStart, rowNum, t); - } - - rowmap_t rowmap; - entries_t entries; - entries_managed_t entriesAux; - values_t values; - values_managed_t valuesAux; -}; - -template -struct SortCrsGraphFunctor { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using team_mem = typename Kokkos::TeamPolicy::member_type; - // The functor owns memory for entriesAux, so it can't have - // MemoryTraits - using entries_managed_t = Kokkos::View; - - SortCrsGraphFunctor(bool usingRangePol, const rowmap_t& rowmap_, - const entries_t& entries_) - : rowmap(rowmap_), entries(entries_) { - if (usingRangePol) { - entriesAux = entries_managed_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), - entries.extent(0)); - } - // otherwise, aux arrays won't be allocated (sorting in place) - } - - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - // Radix sort requires unsigned keys for comparison - using unsigned_lno_t = typename std::make_unsigned::type; - KokkosKernels::SerialRadixSort( - (unsigned_lno_t*)entries.data() + rowStart, - (unsigned_lno_t*)entriesAux.data() + rowStart, rowNum); - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const { - size_type i = t.league_rank(); - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - KokkosKernels::TeamBitonicSort( - entries.data() + rowStart, rowNum, t); - } - - rowmap_t rowmap; - entries_t entries; - entries_managed_t entriesAux; -}; - -template -struct MergedRowmapFunctor { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using c_rowmap_t = typename rowmap_t::const_type; - - // Precondition: entries are sorted within each row - MergedRowmapFunctor(const rowmap_t& mergedCounts_, const c_rowmap_t& rowmap_, - const entries_t& entries_) - : mergedCounts(mergedCounts_), rowmap(rowmap_), entries(entries_) {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t row, size_type& lnewNNZ) const { - size_type rowBegin = rowmap(row); - size_type rowEnd = rowmap(row + 1); - if (rowEnd == rowBegin) { - // Row was empty to begin with - mergedCounts(row) = 0; - return; - } - // Otherwise, the first entry in the row exists - lno_t uniqueEntries = 1; - for (size_type j = rowBegin + 1; j < rowEnd; j++) { - if (entries(j - 1) != entries(j)) uniqueEntries++; - } - mergedCounts(row) = uniqueEntries; - lnewNNZ += uniqueEntries; - if (row == lno_t((rowmap.extent(0) - 1) - 1)) mergedCounts(row + 1) = 0; - } - - rowmap_t mergedCounts; - c_rowmap_t rowmap; - entries_t entries; -}; - -template -struct MatrixMergedEntriesFunctor { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using scalar_t = typename values_t::non_const_value_type; - - // Precondition: entries are sorted within each row - MatrixMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_, - const values_t& values_, - const rowmap_t& mergedRowmap_, - const entries_t& mergedEntries_, - const values_t& mergedValues_) - : rowmap(rowmap_), - entries(entries_), - values(values_), - mergedRowmap(mergedRowmap_), - mergedEntries(mergedEntries_), - mergedValues(mergedValues_) {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const { - size_type rowBegin = rowmap(row); - size_type rowEnd = rowmap(row + 1); - if (rowEnd == rowBegin) { - // Row was empty to begin with, nothing to do - return; - } - // Otherwise, accumulate the value for each column - scalar_t accumVal = values(rowBegin); - lno_t accumCol = entries(rowBegin); - size_type insertPos = mergedRowmap(row); - for (size_type j = rowBegin + 1; j < rowEnd; j++) { - if (accumCol == entries(j)) { - // accumulate - accumVal += values(j); - } else { - // write out and reset - mergedValues(insertPos) = accumVal; - mergedEntries(insertPos) = accumCol; - insertPos++; - accumVal = values(j); - accumCol = entries(j); - } - } - // always left with the last unique entry - mergedValues(insertPos) = accumVal; - mergedEntries(insertPos) = accumCol; - } - - rowmap_t rowmap; - entries_t entries; - values_t values; - rowmap_t mergedRowmap; - entries_t mergedEntries; - values_t mergedValues; -}; - -template -struct GraphMergedEntriesFunctor { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - - // Precondition: entries are sorted within each row - GraphMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_, - const rowmap_t& mergedRowmap_, - const entries_t& mergedEntries_) - : rowmap(rowmap_), - entries(entries_), - mergedRowmap(mergedRowmap_), - mergedEntries(mergedEntries_) {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const { - size_type rowBegin = rowmap(row); - size_type rowEnd = rowmap(row + 1); - if (rowEnd == rowBegin) { - // Row was empty to begin with, nothing to do - return; - } - // Otherwise, accumulate the value for each column - lno_t accumCol = entries(rowBegin); - size_type insertPos = mergedRowmap(row); - for (size_type j = rowBegin + 1; j < rowEnd; j++) { - if (accumCol != entries(j)) { - // write out and reset - mergedEntries(insertPos) = accumCol; - insertPos++; - accumCol = entries(j); - } - } - // always left with the last unique entry - mergedEntries(insertPos) = accumCol; - } - - rowmap_t rowmap; - entries_t entries; - rowmap_t mergedRowmap; - entries_t mergedEntries; -}; - // Functor that sorts a view on one team template @@ -524,274 +248,6 @@ struct BitonicPhase2Functor { } // namespace Impl -// Sort a CRS matrix: within each row, sort entries ascending by column. -// At the same time, permute the values. -template -void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, - const values_t& values) { - using lno_t = typename entries_t::non_const_value_type; - using team_pol = Kokkos::TeamPolicy; - bool useRadix = !Impl::kk_is_gpu_exec_space(); - lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; - if (numRows == 0) return; - Impl::SortCrsMatrixFunctor - funct(useRadix, rowmap, entries, values); - if (useRadix) { - Kokkos::parallel_for("sort_crs_matrix", - Kokkos::RangePolicy(0, numRows), - funct); - } else { - // Try to get teamsize to be largest power of 2 not greater than avg entries - // per row - // TODO (probably important for performnce): add thread-level sort also, and - // use that for small avg degree. But this works for now. - lno_t idealTeamSize = 1; - lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; - while (idealTeamSize < avgDeg / 2) { - idealTeamSize *= 2; - } - team_pol temp(numRows, 1); - lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); - lno_t teamSize = std::min(idealTeamSize, maxTeamSize); - Kokkos::parallel_for("sort_crs_matrix", team_pol(numRows, teamSize), funct); - } -} - -template -void sort_crs_matrix(const crsMat_t& A) { - // Note: rowmap_t has const values, but that's OK as sorting doesn't modify it - using rowmap_t = typename crsMat_t::row_map_type; - using entries_t = typename crsMat_t::index_type::non_const_type; - using values_t = typename crsMat_t::values_type::non_const_type; - using exec_space = typename crsMat_t::execution_space; - // NOTE: the rowmap of a StaticCrsGraph is const-valued, but the - // entries and CrsMatrix values are non-const (so sorting them directly - // is allowed) - sort_crs_matrix( - A.graph.row_map, A.graph.entries, A.values); -} - -namespace Impl { - -template -KOKKOS_INLINE_FUNCTION void kk_swap(T& a, T& b) { - T t = a; - a = b; - b = t; -} - -template -struct sort_bsr_functor { - using lno_t = typename entries_type::non_const_value_type; - - row_map_type rowmap; - entries_type entries; - values_type values; - const lno_t blocksize; - - sort_bsr_functor(row_map_type rowmap_, entries_type entries_, - values_type values_, const lno_t blocksize_) - : rowmap(rowmap_), - entries(entries_), - values(values_), - blocksize(blocksize_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const lno_t i) const { - const lno_t rowStart = rowmap(i); - const lno_t rowSize = rowmap(i + 1) - rowStart; - auto* e = entries.data() + rowStart; - auto* v = values.data() + rowStart * blocksize; - bool done = false; - while (!done) { - done = true; - for (lno_t j = 1; j < rowSize; ++j) { - const lno_t jp = j - 1; - if (e[jp] <= e[j]) continue; - Impl::kk_swap(e[jp], e[j]); - auto const vb = v + j * blocksize; - auto const vbp = v + jp * blocksize; - for (lno_t k = 0; k < blocksize; - ++k) // std::swap_ranges(vb, vb + blocksize, vbp); - Impl::kk_swap(vb[k], vbp[k]); - done = false; - } - } - } -}; - -} // namespace Impl - -// Sort a BRS matrix: within each row, sort entries ascending by column and -// permute the values accordingly. -template -void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, - const entries_t& entries, const values_t& values) { - // TODO: this is O(N^2) mock for debugging - do regular implementation based - // on Radix/Bitonic sort (like CSR) IDEA: maybe we need only one general - // Radix2/Bitonic2 and CSR sorting may call it with blockSize=1 ? - lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; - if (numRows == 0) return; - const lno_t blocksize = blockdim * blockdim; - - assert(values.extent(0) == entries.extent(0) * blocksize); - Impl::sort_bsr_functor bsr_sorter( - rowmap, entries, values, blocksize); - Kokkos::parallel_for("sort_bsr_matrix", - Kokkos::RangePolicy(0, numRows), - bsr_sorter); -} - -// Sort a BSR matrix (like CRS but single values are replaced with contignous -// blocks) -template -void sort_bsr_matrix(const bsrMat_t& A) { - // NOTE: unlike rowmap, entries and values are non-const, so we can sort them - // directly - sort_bsr_matrix( - A.blockDim(), A.graph.row_map, A.graph.entries, A.values); -} - -// Sort a CRS graph: within each row, sort entries ascending by column. -template -void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { - using lno_t = typename entries_t::non_const_value_type; - using team_pol = Kokkos::TeamPolicy; - bool useRadix = !Impl::kk_is_gpu_exec_space(); - lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; - if (numRows == 0) return; - Impl::SortCrsGraphFunctor funct( - useRadix, rowmap, entries); - if (useRadix) { - Kokkos::parallel_for("sort_crs_graph", - Kokkos::RangePolicy(0, numRows), - funct); - } else { - // Try to get teamsize to be largest power of 2 less than or equal to - // half the entries per row. 0.5 * #entries is bitonic's parallelism within - // a row. - // TODO (probably important for performnce): add thread-level sort also, and - // use that for small avg degree. But this works for now. - lno_t idealTeamSize = 1; - lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; - while (idealTeamSize < avgDeg / 2) { - idealTeamSize *= 2; - } - team_pol temp(numRows, 1); - lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); - lno_t teamSize = std::min(idealTeamSize, maxTeamSize); - Kokkos::parallel_for("sort_crs_graph", team_pol(numRows, teamSize), funct); - } -} - -template -void sort_crs_graph(const crsGraph_t& G) { - static_assert( - !std::is_const::value, - "sort_crs_graph requires StaticCrsGraph entries to be non-const."); - sort_crs_graph(G.row_map, G.entries); -} - -// Sort the rows of matrix, and merge duplicate entries. -template -crsMat_t sort_and_merge_matrix(const crsMat_t& A) { - using c_rowmap_t = typename crsMat_t::row_map_type; - using rowmap_t = typename crsMat_t::row_map_type::non_const_type; - using entries_t = typename crsMat_t::index_type::non_const_type; - using values_t = typename crsMat_t::values_type::non_const_type; - using size_type = typename rowmap_t::non_const_value_type; - using exec_space = typename crsMat_t::execution_space; - using range_t = Kokkos::RangePolicy; - sort_crs_matrix(A); - // Count entries per row into a new rowmap, in terms of merges that can be - // done - rowmap_t mergedRowmap( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"), - A.numRows() + 1); - size_type numCompressedEntries = 0; - Kokkos::parallel_reduce(range_t(0, A.numRows()), - Impl::MergedRowmapFunctor( - mergedRowmap, A.graph.row_map, A.graph.entries), - numCompressedEntries); - // Prefix sum to get rowmap - Impl::kk_exclusive_parallel_prefix_sum(A.numRows() + 1, - mergedRowmap); - entries_t mergedEntries("SortedMerged entries", numCompressedEntries); - values_t mergedValues("SortedMerged values", numCompressedEntries); - // Compute merged entries and values - Kokkos::parallel_for( - range_t(0, A.numRows()), - Impl::MatrixMergedEntriesFunctor( - A.graph.row_map, A.graph.entries, A.values, mergedRowmap, - mergedEntries, mergedValues)); - // Finally, construct the new compressed matrix - return crsMat_t("SortedMerged", A.numRows(), A.numCols(), - numCompressedEntries, mergedValues, mergedRowmap, - mergedEntries); -} - -template -void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, - const entries_t& entries_in, rowmap_t& rowmap_out, - entries_t& entries_out) { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using range_t = Kokkos::RangePolicy; - using const_rowmap_t = typename rowmap_t::const_type; - lno_t numRows = rowmap_in.extent(0); - if (numRows <= 1) { - // Matrix has zero rows - rowmap_out = rowmap_t(); - entries_out = entries_t(); - return; - } - numRows--; - // Sort in place - sort_crs_graph(rowmap_in, entries_in); - // Count entries per row into a new rowmap, in terms of merges that can be - // done - rowmap_out = rowmap_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"), - numRows + 1); - size_type numCompressedEntries = 0; - Kokkos::parallel_reduce(range_t(0, numRows), - Impl::MergedRowmapFunctor( - rowmap_out, rowmap_in, entries_in), - numCompressedEntries); - // Prefix sum to get rowmap - Impl::kk_exclusive_parallel_prefix_sum(numRows + 1, - rowmap_out); - entries_out = entries_t("SortedMerged entries", numCompressedEntries); - // Compute merged entries and values - Kokkos::parallel_for( - range_t(0, numRows), - Impl::GraphMergedEntriesFunctor( - rowmap_in, entries_in, rowmap_out, entries_out)); -} - -template -crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { - using rowmap_t = typename crsGraph_t::row_map_type::non_const_type; - using entries_t = typename crsGraph_t::entries_type; - static_assert( - !std::is_const::value, - "sort_and_merge_graph requires StaticCrsGraph entries to be non-const."); - rowmap_t mergedRowmap; - entries_t mergedEntries; - sort_and_merge_graph(G.row_map, G.entries, mergedRowmap, - mergedEntries); - return crsGraph_t(mergedEntries, mergedRowmap); -} - // Version to be called from host on a single array // Generally ~2x slower than Kokkos::sort() for large arrays (> 50 M elements), // but faster for smaller arrays. @@ -1125,39 +581,6 @@ KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, // For backward compatibility: keep the public interface accessible in // KokkosKernels::Impl:: namespace Impl { -template -[[deprecated]] void sort_crs_graph(const rowmap_t& rowmap, - const entries_t& entries) { - KokkosKernels::sort_crs_graph(rowmap, - entries); -} - -template -[[deprecated]] void sort_crs_matrix(const rowmap_t& rowmap, - const entries_t& entries, - const values_t& values) { - KokkosKernels::sort_crs_matrix(rowmap, entries, values); -} - -template -[[deprecated]] void sort_crs_matrix(const crsMat_t& A) { - KokkosKernels::sort_crs_matrix(A); -} - -template -[[deprecated]] void sort_and_merge_graph( - const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, - rowmap_t& rowmap_out, entries_t& entries_out) { - KokkosKernels::sort_and_merge_graph( - rowmap_in, entries_in, rowmap_out, entries_out); -} - -template -[[deprecated]] crsMat_t sort_and_merge_matrix(const crsMat_t& A) { - return KokkosKernels::sort_and_merge_matrix(A); -} template < typename View, typename ExecSpace, typename Ordinal, diff --git a/src/common/KokkosKernels_Utils.hpp b/src/common/KokkosKernels_Utils.hpp index 655d89ba67..a6649f102b 100644 --- a/src/common/KokkosKernels_Utils.hpp +++ b/src/common/KokkosKernels_Utils.hpp @@ -49,7 +49,7 @@ #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosKernels_SimpleUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosKernels_PrintUtils.hpp" #include "KokkosKernels_VectorUtils.hpp" diff --git a/src/graph/KokkosGraph_ExplicitCoarsening.hpp b/src/graph/KokkosGraph_ExplicitCoarsening.hpp index 8992aa4bb8..322004c0b6 100644 --- a/src/graph/KokkosGraph_ExplicitCoarsening.hpp +++ b/src/graph/KokkosGraph_ExplicitCoarsening.hpp @@ -46,7 +46,7 @@ #define KOKKOSGRAPH_EXPLICIT_COARSEN_HPP #include "KokkosGraph_ExplicitCoarsening_impl.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_SortCrs.hpp" namespace KokkosGraph { namespace Experimental { @@ -86,8 +86,8 @@ void graph_explicit_coarsen( if (compress) { coarse_rowmap_t mergedRowmap; coarse_entries_t mergedEntries; - KokkosKernels::sort_and_merge_graph( + KokkosSparse::sort_and_merge_graph( coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); coarseRowmap = mergedRowmap; coarseEntries = mergedEntries; @@ -125,8 +125,8 @@ void graph_explicit_coarsen_with_inverse_map( if (compress) { coarse_rowmap_t mergedRowmap; coarse_entries_t mergedEntries; - KokkosKernels::sort_and_merge_graph( + KokkosSparse::sort_and_merge_graph( coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); coarseRowmap = mergedRowmap; coarseEntries = mergedEntries; diff --git a/src/impl/tpls/KokkosKernels_tpl_handles_decl.hpp b/src/impl/tpls/KokkosKernels_tpl_handles_decl.hpp index 50b2d1c2ef..aef089fd06 100644 --- a/src/impl/tpls/KokkosKernels_tpl_handles_decl.hpp +++ b/src/impl/tpls/KokkosKernels_tpl_handles_decl.hpp @@ -48,7 +48,7 @@ #include "KokkosBlas_tpl_spec.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE -#include "KokkosKernels_SparseUtils_cusparse.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" namespace KokkosKernels { namespace Impl { diff --git a/src/impl/tpls/KokkosKernels_tpl_handles_def.hpp b/src/impl/tpls/KokkosKernels_tpl_handles_def.hpp index 84b5386a00..a5187986e5 100644 --- a/src/impl/tpls/KokkosKernels_tpl_handles_def.hpp +++ b/src/impl/tpls/KokkosKernels_tpl_handles_def.hpp @@ -69,7 +69,7 @@ CusparseSingleton& CusparseSingleton::singleton() { #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE -#include "KokkosKernels_SparseUtils_rocsparse.hpp" +#include "KokkosSparse_Utils_rocsparse.hpp" namespace KokkosKernels { namespace Impl { diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 77b76868f3..d0ea5cdc26 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -46,7 +46,7 @@ #define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP #include "KokkosKernels_Controls.hpp" -#include "KokkosKernels_SparseUtils_mkl.hpp" +#include "KokkosSparse_Utils_mkl.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include @@ -454,7 +454,7 @@ KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, // cuSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #include "cusparse.h" -#include "KokkosKernels_SparseUtils_cusparse.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" // // From https://docs.nvidia.com/cuda/cusparse/index.html#bsrmv diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index d6f36c0a2b..0a92b91eb2 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -50,7 +50,7 @@ // cuSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #include "cusparse.h" -#include "KokkosKernels_SparseUtils_cusparse.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" namespace KokkosSparse { namespace Impl { @@ -385,7 +385,7 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, // rocSPARSE #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) #include -#include "KokkosKernels_SparseUtils_rocsparse.hpp" +#include "KokkosSparse_Utils_rocsparse.hpp" namespace KokkosSparse { namespace Impl { @@ -542,7 +542,7 @@ KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutRight, #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include -#include "KokkosKernels_SparseUtils_mkl.hpp" +#include "KokkosSparse_Utils_mkl.hpp" namespace KokkosSparse { namespace Impl { diff --git a/src/common/KokkosKernels_Controls.hpp b/src/sparse/KokkosKernels_Controls.hpp similarity index 100% rename from src/common/KokkosKernels_Controls.hpp rename to src/sparse/KokkosKernels_Controls.hpp diff --git a/src/common/KokkosKernels_Handle.hpp b/src/sparse/KokkosKernels_Handle.hpp similarity index 100% rename from src/common/KokkosKernels_Handle.hpp rename to src/sparse/KokkosKernels_Handle.hpp diff --git a/src/sparse/KokkosSparse_IOUtils.hpp b/src/sparse/KokkosSparse_IOUtils.hpp new file mode 100644 index 0000000000..d847fc9d10 --- /dev/null +++ b/src/sparse/KokkosSparse_IOUtils.hpp @@ -0,0 +1,1270 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef _KOKKOSSPARSE_IOUTILS_HPP +#define _KOKKOSSPARSE_IOUTILS_HPP + +#include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +namespace KokkosSparse { +namespace Impl { + +// MD: Bases on Christian's sparseMatrix_generate function in test_crsmatrix.cpp +// file. +template +void kk_sparseMatrix_generate(OrdinalType nrows, OrdinalType ncols, + SizeType &nnz, OrdinalType row_size_variance, + OrdinalType bandwidth, ScalarType *&values, + SizeType *&rowPtr, OrdinalType *&colInd, + OrdinalType block_elem_count = 1) { + rowPtr = new SizeType[nrows + 1]; + + OrdinalType elements_per_row = nrows ? nnz / nrows : 0; + srand(13721); + rowPtr[0] = 0; + for (int row = 0; row < nrows; row++) { + int varianz = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; + int numRowEntries = elements_per_row + varianz; + if (numRowEntries < 0) numRowEntries = 0; + // Clamping numRowEntries above accomplishes 2 things: + // - If ncols is 0, numRowEntries will also be 0 + // - With numRowEntries at most 2/3 the number of columns, in the worst + // case + // 90% of insertions will succeed after 6 tries + if (numRowEntries > 0.66 * ncols) numRowEntries = 0.66 * ncols; + rowPtr[row + 1] = rowPtr[row] + numRowEntries; + } + nnz = rowPtr[nrows]; + values = new ScalarType[nnz]; + colInd = new OrdinalType[nnz]; + for (OrdinalType row = 0; row < nrows; row++) { + for (SizeType k = rowPtr[row]; k < rowPtr[row + 1]; ++k) { + while (true) { + OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row; + while (pos < 0) pos += ncols; + while (pos >= ncols) pos -= ncols; + + bool is_already_in_the_row = false; + for (SizeType j = rowPtr[row]; j < k; j++) { + if (colInd[j] == pos) { + is_already_in_the_row = true; + break; + } + } + if (!is_already_in_the_row) { + colInd[k] = pos; + break; + } + } + } + } + // Sample each value from uniform (-50, 50) for real types, or (-50 - 50i, 50 + // + 50i) for complex types. + Kokkos::View valuesView( + values, nnz * block_elem_count); + ScalarType randStart, randEnd; + KokkosKernels::Impl::getRandomBounds(50.0, randStart, randEnd); + Kokkos::Random_XorShift64_Pool pool(13718); + Kokkos::fill_random(valuesView, pool, randStart, randEnd); +} + +template +void kk_sparseMatrix_generate_lower_upper_triangle( + char uplo, OrdinalType nrows, OrdinalType ncols, SizeType &nnz, + OrdinalType /*row_size_variance*/, OrdinalType /*bandwidth*/, + ScalarType *&values, SizeType *&rowPtr, OrdinalType *&colInd) { + rowPtr = new SizeType[nrows + 1]; + + // OrdinalType elements_per_row = nnz/nrows; + srand(13721); + rowPtr[0] = 0; + for (int row = 0; row < nrows; row++) { + if (uplo == 'L') + rowPtr[row + 1] = rowPtr[row] + row + 1; + else + rowPtr[row + 1] = rowPtr[row] + ncols - (row); + } + nnz = rowPtr[nrows]; + values = new ScalarType[nnz]; + colInd = new OrdinalType[nnz]; + for (OrdinalType row = 0; row < nrows; row++) { + for (SizeType k = rowPtr[row]; k < rowPtr[row + 1]; k++) { + if (uplo == 'L') + colInd[k] = k - rowPtr[row]; + else + colInd[k] = row + (k - rowPtr[row]); + values[k] = 1.0; + } + } +} + +template +void kk_diagonally_dominant_sparseMatrix_generate( + OrdinalType nrows, OrdinalType ncols, SizeType &nnz, + OrdinalType row_size_variance, OrdinalType bandwidth, ScalarType *&values, + SizeType *&rowPtr, OrdinalType *&colInd, + ScalarType diagDominance = 10 * Kokkos::ArithTraits::one()) { + rowPtr = new SizeType[nrows + 1]; + + OrdinalType elements_per_row = nnz / nrows; + srand(13721); + rowPtr[0] = 0; + for (int row = 0; row < nrows; row++) { + int varianz = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; + if (varianz < 1) varianz = 1; + if (varianz > 0.75 * ncols) varianz = 0.75 * ncols; + rowPtr[row + 1] = rowPtr[row] + elements_per_row + varianz; + if (rowPtr[row + 1] <= rowPtr[row]) // This makes sure that there is + rowPtr[row + 1] = rowPtr[row] + 1; // at least one nonzero in the row + } + nnz = rowPtr[nrows]; + values = new ScalarType[nnz]; + colInd = new OrdinalType[nnz]; + for (OrdinalType row = 0; row < nrows; row++) { + ScalarType total_values = 0; + std::unordered_set entriesInRow; + // We always add the diagonal entry (after this loop) + entriesInRow.insert(row); + for (SizeType k = rowPtr[row]; k < rowPtr[row + 1] - 1; k++) { + while (true) { + OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row; + while (pos < 0) pos += ncols; + while (pos >= ncols) pos -= ncols; + + if (entriesInRow.find(pos) == entriesInRow.end()) { + entriesInRow.insert(pos); + colInd[k] = pos; + values[k] = 100.0 * rand() / RAND_MAX - 50.0; + total_values += + Kokkos::Details::ArithTraits::abs(values[k]); + break; + } + } + } + + colInd[rowPtr[row + 1] - 1] = row; + values[rowPtr[row + 1] - 1] = total_values * diagDominance; + } +} + +// This function creates a diagonal sparse matrix for testing matrix operations. +// The elements on the diagonal are 1, 2, ..., n-1, n. +// If "invert" is true, it will return the inverse of the above diagonal matrix. +template +crsMat_t kk_generate_diag_matrix(typename crsMat_t::const_ordinal_type n, + const bool invert = false) { + typedef typename crsMat_t::ordinal_type ot; + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + typedef typename row_map_view_t::non_const_value_type size_type; + typedef typename cols_view_t::non_const_value_type lno_t; + typedef typename values_view_t::non_const_value_type scalar_t; + + row_map_view_t rowmap_view("rowmap_view", n + 1); + cols_view_t columns_view("colsmap_view", n); + values_view_t values_view("values_view", n); + + { + typename row_map_view_t::HostMirror hr = + Kokkos::create_mirror_view(rowmap_view); + typename cols_view_t::HostMirror hc = + Kokkos::create_mirror_view(columns_view); + typename values_view_t::HostMirror hv = + Kokkos::create_mirror_view(values_view); + + for (lno_t i = 0; i <= n; ++i) { + hr(i) = size_type(i); + } + + for (ot i = 0; i < n; ++i) { + hc(i) = lno_t(i); + if (invert) { + hv(i) = scalar_t(1.0) / (scalar_t(i + 1)); + } else { + hv(i) = scalar_t(i + 1); + } + } + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); + Kokkos::deep_copy(values_view, hv); + } + + graph_t static_graph(columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", n, values_view, static_graph); + return crsmat; +} + +template +crsMat_t kk_generate_diagonally_dominant_sparse_matrix( + typename crsMat_t::const_ordinal_type nrows, + typename crsMat_t::const_ordinal_type ncols, + typename crsMat_t::non_const_size_type &nnz, + typename crsMat_t::const_ordinal_type row_size_variance, + typename crsMat_t::const_ordinal_type bandwidth, + typename crsMat_t::const_value_type diagDominance = + 10 * Kokkos::ArithTraits::one()) { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + typedef typename row_map_view_t::non_const_value_type size_type; + typedef typename cols_view_t::non_const_value_type lno_t; + typedef typename values_view_t::non_const_value_type scalar_t; + lno_t *adj; + size_type *xadj; //, nnzA; + scalar_t *values; + + kk_diagonally_dominant_sparseMatrix_generate( + nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj, + diagDominance); + + row_map_view_t rowmap_view("rowmap_view", nrows + 1); + cols_view_t columns_view("colsmap_view", nnz); + values_view_t values_view("values_view", nnz); + + { + typename row_map_view_t::HostMirror hr = + Kokkos::create_mirror_view(rowmap_view); + typename cols_view_t::HostMirror hc = + Kokkos::create_mirror_view(columns_view); + typename values_view_t::HostMirror hv = + Kokkos::create_mirror_view(values_view); + + for (lno_t i = 0; i <= nrows; ++i) { + hr(i) = xadj[i]; + } + + for (size_type i = 0; i < nnz; ++i) { + hc(i) = adj[i]; + hv(i) = values[i]; + } + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); + Kokkos::deep_copy(values_view, hv); + } + + graph_t static_graph(columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph); + delete[] xadj; + delete[] adj; + delete[] values; + return crsmat; +} + +template +crsMat_t kk_generate_triangular_sparse_matrix( + char uplo, typename crsMat_t::const_ordinal_type nrows, + typename crsMat_t::const_ordinal_type ncols, + typename crsMat_t::non_const_size_type &nnz, + typename crsMat_t::const_ordinal_type row_size_variance, + typename crsMat_t::const_ordinal_type bandwidth) { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + typedef typename row_map_view_t::non_const_value_type size_type; + typedef typename cols_view_t::non_const_value_type lno_t; + typedef typename values_view_t::non_const_value_type scalar_t; + lno_t *adj; + size_type *xadj; //, nnzA; + scalar_t *values; + + kk_sparseMatrix_generate_lower_upper_triangle( + uplo, nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj); + + row_map_view_t rowmap_view("rowmap_view", nrows + 1); + cols_view_t columns_view("colsmap_view", nnz); + values_view_t values_view("values_view", nnz); + + { + typename row_map_view_t::HostMirror hr = + Kokkos::create_mirror_view(rowmap_view); + typename cols_view_t::HostMirror hc = + Kokkos::create_mirror_view(columns_view); + typename values_view_t::HostMirror hv = + Kokkos::create_mirror_view(values_view); + + for (lno_t i = 0; i <= nrows; ++i) { + hr(i) = xadj[i]; + } + + for (size_type i = 0; i < nnz; ++i) { + hc(i) = adj[i]; + hv(i) = values[i]; + } + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); + Kokkos::deep_copy(values_view, hv); + Kokkos::fence(); + } + + graph_t static_graph(columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph); + delete[] xadj; + delete[] adj; + delete[] values; + return crsmat; +} + +template +crsMat_t kk_generate_sparse_matrix( + typename crsMat_t::const_ordinal_type nrows, + typename crsMat_t::const_ordinal_type ncols, + typename crsMat_t::non_const_size_type &nnz, + typename crsMat_t::const_ordinal_type row_size_variance, + typename crsMat_t::const_ordinal_type bandwidth) { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + typedef typename row_map_view_t::non_const_value_type size_type; + typedef typename cols_view_t::non_const_value_type lno_t; + typedef typename values_view_t::non_const_value_type scalar_t; + lno_t *adj; + size_type *xadj; //, nnzA; + scalar_t *values; + + kk_sparseMatrix_generate( + nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj); + + row_map_view_t rowmap_view("rowmap_view", nrows + 1); + cols_view_t columns_view("colsmap_view", nnz); + values_view_t values_view("values_view", nnz); + + { + typename row_map_view_t::HostMirror hr = + Kokkos::create_mirror_view(rowmap_view); + typename cols_view_t::HostMirror hc = + Kokkos::create_mirror_view(columns_view); + typename values_view_t::HostMirror hv = + Kokkos::create_mirror_view(values_view); + + for (lno_t i = 0; i <= nrows; ++i) { + hr(i) = xadj[i]; + } + + for (size_type i = 0; i < nnz; ++i) { + hc(i) = adj[i]; + hv(i) = values[i]; + } + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); + Kokkos::deep_copy(values_view, hv); + } + + graph_t static_graph(columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph); + delete[] xadj; + delete[] adj; + delete[] values; + return crsmat; +} + +template +bsrMat_t kk_generate_sparse_matrix( + typename bsrMat_t::const_ordinal_type block_dim, + typename bsrMat_t::const_ordinal_type nrows, + typename bsrMat_t::const_ordinal_type ncols, + typename bsrMat_t::non_const_size_type &nnz, + typename bsrMat_t::const_ordinal_type row_size_variance, + typename bsrMat_t::const_ordinal_type bandwidth) { + typedef KokkosSparse::CrsMatrix< + typename bsrMat_t::value_type, typename bsrMat_t::ordinal_type, + typename bsrMat_t::device_type, typename bsrMat_t::memory_traits, + typename bsrMat_t::size_type> + crsMat_t; + + const auto crs_mtx = kk_generate_sparse_matrix( + nrows * block_dim, ncols * block_dim, nnz, row_size_variance, bandwidth); + bsrMat_t bsrmat(crs_mtx, block_dim); + return bsrmat; +} +// TODO: need to fix the size_type. All over the reading inputs are lno_t. + +template +void convert_crs_to_lower_triangle_edge_list(idx nv, idx *xadj, idx *adj, + idx *lower_triangle_srcs, + idx *lower_triangle_dests) { + idx ind = 0; + for (idx i = 0; i < nv; ++i) { + idx xb = xadj[i]; + idx xe = xadj[i + 1]; + for (idx j = xb; j < xe; ++j) { + idx dst = adj[j]; + if (i < dst) { + lower_triangle_srcs[ind] = i; + lower_triangle_dests[ind++] = dst; + } + } + } +} + +template +void convert_crs_to_edge_list(idx nv, idx *xadj, idx *srcs) { + for (idx i = 0; i < nv; ++i) { + idx xb = xadj[i]; + idx xe = xadj[i + 1]; + for (idx j = xb; j < xe; ++j) { + srcs[j] = i; + } + } +} + +template +void convert_edge_list_to_csr(lno_t nv, size_type ne, lno_t *srcs, lno_t *dests, + wt *ew, size_type *xadj, lno_t *adj, wt *crs_ew) { + std::vector> edges(ne); + for (size_type i = 0; i < ne; ++i) { + edges[i].src = srcs[i]; + edges[i].dst = dests[i]; + edges[i].ew = ew[i]; + } + std::sort(edges.begin(), edges.begin() + ne); + + size_type eind = 0; + for (lno_t i = 0; i < nv; ++i) { + (xadj)[i] = eind; + while (edges[eind].src == i) { + (adj)[eind] = edges[eind].dst; + (*crs_ew)[eind] = edges[eind].ew; + ++eind; + } + } + xadj[nv] = eind; +} + +template +void convert_undirected_edge_list_to_csr(lno_t nv, size_type ne, in_lno_t *srcs, + in_lno_t *dests, size_type *xadj, + lno_t *adj) { + std::vector> edges(ne * 2); + for (size_type i = 0; i < ne; ++i) { + edges[i * 2].src = srcs[i]; + edges[i * 2].dst = dests[i]; + + edges[i * 2 + 1].src = dests[i]; + edges[i * 2 + 1].dst = srcs[i]; + } +#ifdef KOKKOSKERNELS_HAVE_OUTER +#include +#include +#include +#include + __gnu_parallel::parallel_sort_mwms *>( + &(edges[0]), &(edges[0]) + ne * 2, + std::less>(), 64); +#else + std::sort(edges.begin(), edges.begin() + ne * 2); +#endif + + size_type eind = 0; + for (lno_t i = 0; i < nv; ++i) { + (xadj)[i] = eind; + while (edges[eind].src == i) { + (adj)[eind] = edges[eind].dst; + //(*crs_ew)[eind] = edges[eind].ew; + ++eind; + } + } + xadj[nv] = eind; +} + +template +void write_graph_bin(lno_t nv, size_type ne, const size_type *xadj, + const lno_t *adj, const scalar_t *ew, + const char *filename) { + std::ofstream myFile(filename, std::ios::out | std::ios::binary); + myFile.write((char *)&nv, sizeof(lno_t)); + myFile.write((char *)&ne, sizeof(size_type)); + myFile.write((char *)xadj, sizeof(size_type) * (nv + 1)); + + myFile.write((char *)adj, sizeof(lno_t) * (ne)); + + myFile.write((char *)ew, sizeof(scalar_t) * (ne)); + + myFile.close(); +} + +template +void write_graph_crs(lno_t nv, size_type ne, const size_type *xadj, + const lno_t *adj, const scalar_t *ew, + const char *filename) { + std::ofstream myFile(filename, std::ios::out); + myFile << nv << " " << ne << std::endl; + + for (lno_t i = 0; i <= nv; ++i) { + myFile << xadj[i] << " "; + } + myFile << std::endl; + + for (lno_t i = 0; i < nv; ++i) { + size_type b = xadj[i]; + size_type e = xadj[i + 1]; + for (size_type j = b; j < e; ++j) { + myFile << adj[j] << " "; + } + myFile << std::endl; + } + for (size_type i = 0; i < ne; ++i) { + myFile << ew[i] << " "; + } + myFile << std::endl; + + myFile.close(); +} + +template +void write_graph_ligra(lno_t nv, size_type ne, const size_type *xadj, + const lno_t *adj, const scalar_t * /*ew*/, + const char *filename) { + std::ofstream ff(filename); + ff << "AdjacencyGraph" << std::endl; + ff << nv << std::endl << ne << std::endl; + for (lno_t i = 0; i < nv; ++i) { + ff << xadj[i] << std::endl; + } + for (size_type i = 0; i < ne; ++i) { + ff << adj[i] << std::endl; + } + ff.close(); +} + +// MM: types and utility functions for parsing the MatrixMarket format +namespace MM { +enum MtxObject { UNDEFINED_OBJECT, MATRIX, VECTOR }; +enum MtxFormat { UNDEFINED_FORMAT, COORDINATE, ARRAY }; +enum MtxField { + UNDEFINED_FIELD, + REAL, // includes both float and double + COMPLEX, // includes complex and complex + INTEGER, // includes all integer types + PATTERN // not a type, but means the value for every entry is 1 +}; +enum MtxSym { + UNDEFINED_SYMMETRY, + GENERAL, + SYMMETRIC, // A(i, j) = A(j, i) + SKEW_SYMMETRIC, // A(i, j) = -A(j, i) + HERMITIAN // A(i, j) = a + bi; A(j, i) = a - bi +}; + +// readScalar/writeScalar: read and write a scalar in the form that it appears +// in an .mtx file. The >> and << operators won't work, because complex appears +// as "real imag", not "(real, imag)" +template +scalar_t readScalar(std::istream &is) { + scalar_t val; + is >> val; + return val; +} + +template <> +inline Kokkos::complex readScalar(std::istream &is) { + float r, i; + is >> r; + is >> i; + return Kokkos::complex(r, i); +} + +template <> +inline Kokkos::complex readScalar(std::istream &is) { + double r, i; + is >> r; + is >> i; + return Kokkos::complex(r, i); +} + +template +void writeScalar(std::ostream &os, scalar_t val) { + os << val; +} + +template <> +inline void writeScalar(std::ostream &os, Kokkos::complex val) { + os << val.real() << ' ' << val.imag(); +} + +template <> +inline void writeScalar(std::ostream &os, Kokkos::complex val) { + os << val.real() << ' ' << val.imag(); +} + +// symmetryFlip: given a value for A(i, j), return the value that +// should be inserted at A(j, i) (if any) +template +scalar_t symmetryFlip(scalar_t val, MtxSym symFlag) { + if (symFlag == SKEW_SYMMETRIC) return -val; + return val; +} + +template <> +inline Kokkos::complex symmetryFlip(Kokkos::complex val, + MtxSym symFlag) { + if (symFlag == HERMITIAN) + return Kokkos::conj(val); + else if (symFlag == SKEW_SYMMETRIC) + return -val; + return val; +} + +template <> +inline Kokkos::complex symmetryFlip(Kokkos::complex val, + MtxSym symFlag) { + if (symFlag == HERMITIAN) + return Kokkos::conj(val); + else if (symFlag == SKEW_SYMMETRIC) + return -val; + return val; +} +} // namespace MM + +template +void write_matrix_mtx(lno_t nrows, lno_t ncols, size_type nentries, + const size_type *xadj, const lno_t *adj, + const scalar_t *vals, const char *filename) { + std::ofstream myFile(filename); + myFile << "%%MatrixMarket matrix coordinate "; + if (std::is_same>::value || + std::is_same>::value) + myFile << "complex"; + else + myFile << "real"; + myFile << " general\n"; + myFile << nrows << " " << ncols << " " << nentries << '\n'; + myFile << std::setprecision(17) << std::scientific; + for (lno_t i = 0; i < nrows; ++i) { + size_type b = xadj[i]; + size_type e = xadj[i + 1]; + for (size_type j = b; j < e; ++j) { + myFile << i + 1 << " " << adj[j] + 1 << " "; + MM::writeScalar(myFile, vals[j]); + myFile << '\n'; + } + } + myFile.close(); +} + +template +void write_graph_mtx(lno_t nv, size_type ne, const size_type *xadj, + const lno_t *adj, const scalar_t *ew, + const char *filename) { + std::ofstream myFile(filename); + myFile << "%%MatrixMarket matrix coordinate "; + if (std::is_same>::value || + std::is_same>::value) + myFile << "complex"; + else + myFile << "real"; + myFile << " general\n"; + myFile << nv << " " << nv << " " << ne << '\n'; + myFile << std::setprecision(8) << std::scientific; + for (lno_t i = 0; i < nv; ++i) { + size_type b = xadj[i]; + size_type e = xadj[i + 1]; + for (size_type j = b; j < e; ++j) { + myFile << i + 1 << " " << (adj)[j] + 1 << " "; + MM::writeScalar(myFile, ew[j]); + myFile << '\n'; + } + } + + myFile.close(); +} + +template +void read_graph_bin(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, + scalar_t **ew, const char *filename) { + std::ifstream myFile(filename, std::ios::in | std::ios::binary); + + myFile.read((char *)nv, sizeof(lno_t)); + myFile.read((char *)ne, sizeof(size_type)); + KokkosKernels::Impl::md_malloc(xadj, *nv + 1); + KokkosKernels::Impl::md_malloc(adj, *ne); + KokkosKernels::Impl::md_malloc(ew, *ne); + myFile.read((char *)*xadj, sizeof(size_type) * (*nv + 1)); + myFile.read((char *)*adj, sizeof(lno_t) * (*ne)); + myFile.read((char *)*ew, sizeof(scalar_t) * (*ne)); + myFile.close(); +} + +// When Kokkos issue #2313 is resolved, can delete +// parseScalar and just use operator>> +template +scalar_t parseScalar(std::istream &is) { + scalar_t val; + is >> val; + return val; +} + +template <> +inline Kokkos::complex parseScalar(std::istream &is) { + std::complex val; + is >> val; + return Kokkos::complex(val); +} + +template <> +inline Kokkos::complex parseScalar(std::istream &is) { + std::complex val; + is >> val; + return Kokkos::complex(val); +} + +template +void read_graph_crs(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, + scalar_t **ew, const char *filename) { + std::ifstream myFile(filename, std::ios::in); + myFile >> *nv >> *ne; + + KokkosKernels::Impl::md_malloc(xadj, *nv + 1); + KokkosKernels::Impl::md_malloc(adj, *ne); + KokkosKernels::Impl::md_malloc(ew, *ne); + + for (lno_t i = 0; i <= *nv; ++i) { + myFile >> (*xadj)[i]; + } + + for (size_type i = 0; i < *ne; ++i) { + myFile >> (*adj)[i]; + } + for (size_type i = 0; i < *ne; ++i) { + (*ew)[i] = parseScalar(myFile); + } + myFile.close(); +} + +template +void write_kokkos_crst_matrix(crs_matrix_t a_crsmat, const char *filename) { + typedef typename crs_matrix_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crs_matrix_t::values_type::non_const_type values_view_t; + + typedef typename row_map_view_t::value_type offset_t; + typedef typename cols_view_t::value_type lno_t; + typedef typename values_view_t::value_type scalar_t; + typedef typename values_view_t::size_type size_type; + + size_type nnz = a_crsmat.nnz(); + + auto a_rowmap_view = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), a_crsmat.graph.row_map); + auto a_entries_view = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), a_crsmat.graph.entries); + auto a_values_view = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_crsmat.values); + offset_t *a_rowmap = const_cast(a_rowmap_view.data()); + lno_t *a_entries = a_entries_view.data(); + scalar_t *a_values = a_values_view.data(); + + std::string strfilename(filename); + if (KokkosKernels::Impl::endswith(strfilename, ".mtx") || KokkosKernels::Impl::endswith(strfilename, ".mm")) { + write_matrix_mtx( + a_crsmat.numRows(), a_crsmat.numCols(), a_crsmat.nnz(), a_rowmap, + a_entries, a_values, filename); + return; + } else if (a_crsmat.numRows() != a_crsmat.numCols()) { + throw std::runtime_error( + "For formats other than MatrixMarket (suffix .mm or .mtx),\n" + "write_kokkos_crst_matrix only supports square matrices"); + } + if (KokkosKernels::Impl::endswith(strfilename, ".bin")) { + write_graph_bin( + a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename); + } else if (KokkosKernels::Impl::endswith(strfilename, ".ligra")) { + write_graph_ligra( + a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename); + } else if (KokkosKernels::Impl::endswith(strfilename, ".crs")) { + write_graph_crs( + a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename); + } else { + std::string errMsg = + std::string("write_kokkos_crst_matrix: File extension on ") + filename + + " does not correspond to a known format"; + throw std::runtime_error(errMsg); + } +} + +template +int read_mtx(const char *fileName, lno_t *nrows, lno_t *ncols, size_type *ne, + size_type **xadj, lno_t **adj, scalar_t **ew, + bool symmetrize = false, bool remove_diagonal = true, + bool transpose = false) { + using namespace MM; + std::ifstream mmf(fileName, std::ifstream::in); + if (!mmf.is_open()) { + throw std::runtime_error("File cannot be opened\n"); + } + + std::string fline = ""; + getline(mmf, fline); + + if (fline.size() < 2 || fline[0] != '%' || fline[1] != '%') { + throw std::runtime_error("Invalid MM file. Line-1\n"); + } + + // make sure every required field is in the file, by initializing them to + // UNDEFINED_* + MtxObject mtx_object = UNDEFINED_OBJECT; + MtxFormat mtx_format = UNDEFINED_FORMAT; + MtxField mtx_field = UNDEFINED_FIELD; + MtxSym mtx_sym = UNDEFINED_SYMMETRY; + + if (fline.find("matrix") != std::string::npos) { + mtx_object = MATRIX; + } else if (fline.find("vector") != std::string::npos) { + mtx_object = VECTOR; + throw std::runtime_error( + "MatrixMarket \"vector\" is not supported by KokkosKernels read_mtx()"); + } + + if (fline.find("coordinate") != std::string::npos) { + // sparse + mtx_format = COORDINATE; + } else if (fline.find("array") != std::string::npos) { + // dense + mtx_format = ARRAY; + } + + if (fline.find("real") != std::string::npos || + fline.find("double") != std::string::npos) { + if (std::is_same::value || + std::is_same::value) + mtx_field = REAL; + else { + if (!std::is_floating_point::value) + throw std::runtime_error( + "scalar_t in read_mtx() incompatible with float or double typed " + "MatrixMarket file."); + else + mtx_field = REAL; + } + } else if (fline.find("complex") != std::string::npos) { + if (!(std::is_same>::value || + std::is_same>::value)) + throw std::runtime_error( + "scalar_t in read_mtx() incompatible with complex-typed MatrixMarket " + "file."); + else + mtx_field = COMPLEX; + } else if (fline.find("integer") != std::string::npos) { + if (std::is_integral::value || + std::is_floating_point::value || + std::is_same::value || + std::is_same::value) + mtx_field = INTEGER; + else + throw std::runtime_error( + "scalar_t in read_mtx() incompatible with integer-typed MatrixMarket " + "file."); + } else if (fline.find("pattern") != std::string::npos) { + mtx_field = PATTERN; + // any reasonable choice for scalar_t can represent "1" or "1.0 + 0i", so + // nothing to check here + } + + if (fline.find("general") != std::string::npos) { + mtx_sym = GENERAL; + } else if (fline.find("skew-symmetric") != std::string::npos) { + mtx_sym = SKEW_SYMMETRIC; + } else if (fline.find("symmetric") != std::string::npos) { + // checking for "symmetric" after "skew-symmetric" because it's a substring + mtx_sym = SYMMETRIC; + } else if (fline.find("hermitian") != std::string::npos || + fline.find("Hermitian") != std::string::npos) { + mtx_sym = HERMITIAN; + } + // Validate the matrix attributes + if (mtx_format == ARRAY) { + if (mtx_sym == UNDEFINED_SYMMETRY) mtx_sym = GENERAL; + if (mtx_sym != GENERAL) + throw std::runtime_error( + "array format MatrixMarket file must have general symmetry (optional " + "to include \"general\")"); + } + if (mtx_object == UNDEFINED_OBJECT) + throw std::runtime_error( + "MatrixMarket file header is missing the object type."); + if (mtx_format == UNDEFINED_FORMAT) + throw std::runtime_error("MatrixMarket file header is missing the format."); + if (mtx_field == UNDEFINED_FIELD) + throw std::runtime_error( + "MatrixMarket file header is missing the field type."); + if (mtx_sym == UNDEFINED_SYMMETRY) + throw std::runtime_error( + "MatrixMarket file header is missing the symmetry type."); + + while (1) { + getline(mmf, fline); + if (fline[0] != '%') break; + } + std::stringstream ss(fline); + lno_t nr = 0, nc = 0; + size_type nnz = 0; + ss >> nr >> nc; + if (mtx_format == COORDINATE) + ss >> nnz; + else + nnz = nr * nc; + size_type numEdges = nnz; + symmetrize = symmetrize || mtx_sym != GENERAL; + if (symmetrize && nr != nc) { + throw std::runtime_error("A non-square matrix cannot be symmetrized."); + } + if (mtx_format == ARRAY) { + // Array format only supports general symmetry and non-pattern + if (symmetrize) + throw std::runtime_error( + "array format MatrixMarket file cannot be symmetrized."); + if (mtx_field == PATTERN) + throw std::runtime_error( + "array format MatrixMarket file can't have \"pattern\" field type."); + } + if (symmetrize) { + numEdges = 2 * nnz; + } + // numEdges is only an upper bound (diagonal entries may be removed) + std::vector> edges(numEdges); + size_type nE = 0; + lno_t numDiagonal = 0; + for (size_type i = 0; i < nnz; ++i) { + getline(mmf, fline); + std::stringstream ss2(fline); + struct KokkosKernels::Impl::Edge tmp; + // read source, dest (edge) and weight (value) + lno_t s, d; + scalar_t w; + if (mtx_format == ARRAY) { + // In array format, entries are listed in column major order, + // so the row and column can be determined just from the index i + //(but make them 1-based indices, to match the way coordinate works) + s = i % nr + 1; // row + d = i / nr + 1; // col + } else { + // In coordinate format, row and col of each entry is read from file + ss2 >> s >> d; + } + if (mtx_field == PATTERN) + w = 1; + else + w = readScalar(ss2); + if (!transpose) { + tmp.src = s - 1; + tmp.dst = d - 1; + tmp.ew = w; + } else { + tmp.src = d - 1; + tmp.dst = s - 1; + tmp.ew = w; + } + if (tmp.src == tmp.dst) { + numDiagonal++; + if (!remove_diagonal) { + edges[nE++] = tmp; + } + continue; + } + edges[nE++] = tmp; + if (symmetrize) { + struct KokkosKernels::Impl::Edge tmp2; + tmp2.src = tmp.dst; + tmp2.dst = tmp.src; + // the symmetrized value is w, -w or conj(w) if mtx_sym is + // SYMMETRIC, SKEW_SYMMETRIC or HERMITIAN, respectively. + tmp2.ew = symmetryFlip(tmp.ew, mtx_sym); + edges[nE++] = tmp2; + } + } + mmf.close(); + std::sort(edges.begin(), edges.begin() + nE); + if (transpose) { + lno_t tmp = nr; + nr = nc; + nc = tmp; + } + // idx *nv, idx *ne, idx **xadj, idx **adj, wt **wt + *nrows = nr; + *ncols = nc; + *ne = nE; + //*xadj = new idx[nr + 1]; + KokkosKernels::Impl::md_malloc(xadj, nr + 1); + //*adj = new idx[nE]; + KokkosKernels::Impl::md_malloc(adj, nE); + //*ew = new wt[nE]; + KokkosKernels::Impl::md_malloc(ew, nE); + size_type eind = 0; + size_type actual = 0; + for (lno_t i = 0; i < nr; ++i) { + (*xadj)[i] = actual; + bool is_first = true; + while (eind < nE && edges[eind].src == i) { + if (is_first || !symmetrize || eind == 0 || + (eind > 0 && edges[eind - 1].dst != edges[eind].dst)) { + (*adj)[actual] = edges[eind].dst; + (*ew)[actual] = edges[eind].ew; + ++actual; + } + is_first = false; + ++eind; + } + } + (*xadj)[nr] = actual; + *ne = actual; + return 0; +} + +// Version of read_mtx which does not capture the number of columns. +// This is the old interface; it's kept for backwards compatibility. +template +int read_mtx(const char *fileName, lno_t *nv, size_type *ne, size_type **xadj, + lno_t **adj, scalar_t **ew, bool symmetrize = false, + bool remove_diagonal = true, bool transpose = false) { + lno_t ncol; // will discard + return read_mtx(fileName, nv, &ncol, ne, xadj, + adj, ew, symmetrize, + remove_diagonal, transpose); +} + +template +void read_matrix(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, + scalar_t **ew, const char *filename) { + std::string strfilename(filename); + if (KokkosKernels::Impl::endswith(strfilename, ".mtx") || KokkosKernels::Impl::endswith(strfilename, ".mm")) { + read_mtx(filename, nv, ne, xadj, adj, ew, false, false, false); + } + + else if (KokkosKernels::Impl::endswith(strfilename, ".bin")) { + read_graph_bin(nv, ne, xadj, adj, ew, filename); + } + + else if (KokkosKernels::Impl::endswith(strfilename, ".crs")) { + read_graph_crs(nv, ne, xadj, adj, ew, filename); + } + + else { + throw std::runtime_error("Reader is not available\n"); + } +} + +template +crsMat_t read_kokkos_crst_matrix(const char *filename_) { + std::string strfilename(filename_); + bool isMatrixMarket = + KokkosKernels::Impl::endswith(strfilename, ".mtx") || KokkosKernels::Impl::endswith(strfilename, ".mm"); + + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + typedef typename row_map_view_t::value_type size_type; + typedef typename cols_view_t::value_type lno_t; + typedef typename values_view_t::value_type scalar_t; + + lno_t nr, nc, *adj; + size_type *xadj, nnzA; + scalar_t *values; + + if (isMatrixMarket) { + // MatrixMarket file contains the exact number of columns + read_mtx(filename_, &nr, &nc, &nnzA, &xadj, + &adj, &values, false, false, false); + } else { + //.crs and .bin files don't contain #cols, so will compute it later based on + // the entries + read_matrix(&nr, &nnzA, &xadj, &adj, &values, + filename_); + } + + row_map_view_t rowmap_view("rowmap_view", nr + 1); + cols_view_t columns_view("colsmap_view", nnzA); + values_view_t values_view("values_view", nnzA); + + { + Kokkos::View> + hr(xadj, nr + 1); + Kokkos::View> + hc(adj, nnzA); + Kokkos::View> + hv(values, nnzA); + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); + Kokkos::deep_copy(values_view, hv); + } + + if (!isMatrixMarket) { + KokkosKernels::Impl::kk_view_reduce_max( + nnzA, columns_view, nc); + nc++; + } + + graph_t static_graph(columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", nc, values_view, static_graph); + delete[] xadj; + delete[] adj; + delete[] values; + return crsmat; +} + +template +crsGraph_t read_kokkos_crst_graph(const char *filename_) { + typedef typename crsGraph_t::row_map_type::non_const_type row_map_view_t; + typedef typename crsGraph_t::entries_type::non_const_type cols_view_t; + + typedef typename row_map_view_t::value_type size_type; + typedef typename cols_view_t::value_type lno_t; + typedef double scalar_t; + + lno_t nv, *adj; + size_type *xadj, nnzA; + scalar_t *values; + read_matrix(&nv, &nnzA, &xadj, &adj, &values, + filename_); + + row_map_view_t rowmap_view("rowmap_view", nv + 1); + cols_view_t columns_view("colsmap_view", nnzA); + + { + typename row_map_view_t::HostMirror hr = + Kokkos::create_mirror_view(rowmap_view); + typename cols_view_t::HostMirror hc = + Kokkos::create_mirror_view(columns_view); + + for (lno_t i = 0; i <= nv; ++i) { + hr(i) = xadj[i]; + } + + for (size_type i = 0; i < nnzA; ++i) { + hc(i) = adj[i]; + } + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); + } + + lno_t ncols = 0; + KokkosKernels::Impl::kk_view_reduce_max( + nnzA, columns_view, ncols); + ncols += 1; + + crsGraph_t static_graph(columns_view, rowmap_view, ncols); + delete[] xadj; + delete[] adj; + delete[] values; + return static_graph; +} + +template +inline void kk_sequential_create_incidence_matrix( + nnz_lno_t num_rows, const size_type *xadj, const nnz_lno_t *adj, + size_type *i_adj // output. preallocated +) { + std::vector c_xadj(num_rows); + for (nnz_lno_t i = 0; i < num_rows; i++) { + c_xadj[i] = xadj[i]; + } + int eCnt = 0; + for (nnz_lno_t i = 0; i < num_rows; i++) { + size_type begin = xadj[i]; + size_type end = xadj[i + 1]; + nnz_lno_t adjsize = end - begin; + + for (nnz_lno_t j = 0; j < adjsize; j++) { + size_type aind = j + begin; + nnz_lno_t col = adj[aind]; + if (i < col) { + i_adj[c_xadj[i]++] = eCnt; + i_adj[c_xadj[col]++] = eCnt++; + } + } + } + + for (nnz_lno_t i = 0; i < num_rows; i++) { + if (c_xadj[i] != xadj[i + 1]) { + std::cout << "i:" << i << " c_xadj[i]:" << c_xadj[i] + << " xadj[i+1]:" << xadj[i + 1] << std::endl; + } + } +} + +template +inline void kk_sequential_create_incidence_matrix_transpose( + const nnz_lno_t num_rows, const size_type num_edges, const size_type *xadj, + const nnz_lno_t *adj, + size_type *i_xadj, // output. preallocated + nnz_lno_t *i_adj // output. preallocated +) { + for (nnz_lno_t i = 0; i < num_edges / 2 + 1; i++) { + i_xadj[i] = i * 2; + } + int eCnt = 0; + for (nnz_lno_t i = 0; i < num_rows; i++) { + size_type begin = xadj[i]; + size_type end = xadj[i + 1]; + nnz_lno_t adjsize = end - begin; + + for (nnz_lno_t j = 0; j < adjsize; j++) { + size_type aind = j + begin; + nnz_lno_t col = adj[aind]; + if (i < col) { + i_adj[eCnt++] = i; + i_adj[eCnt++] = col; + } + } + } +} + +} // namespace Impl +} // namespace KokkosKernels +#endif // _KOKKOSSPARSE_IOUTILS_HPP diff --git a/src/sparse/KokkosSparse_SortCrs.hpp b/src/sparse/KokkosSparse_SortCrs.hpp new file mode 100644 index 0000000000..03d51386e5 --- /dev/null +++ b/src/sparse/KokkosSparse_SortCrs.hpp @@ -0,0 +1,725 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef _KOKKOSSPARSE_SORTCRS_HPP +#define _KOKKOSSPARSE_SORTCRS_HPP + +#include "Kokkos_Core.hpp" +#include "KokkosKernels_Sorting.hpp" + +namespace KokkosSparse { + +// ---------------------------------- +// BSR matrix/graph sorting utilities +// ---------------------------------- + +// Sort a BRS matrix: within each row, sort entries ascending by column and +// permute the values accordingly. +template +void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, + const entries_t& entries, const values_t& values); + +template +void sort_bsr_matrix(const bsrMat_t& A); + +// ---------------------------------- +// CRS matrix/graph sorting utilities +// ---------------------------------- + +// The sort_crs* functions sort the adjacent column list for each row into +// ascending order. + +template +void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, + const values_t& values); + +template +void sort_crs_matrix(const crsMat_t& A); + +template +void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries); + +template +void sort_crs_graph(const crsGraph_t& G); + +// sort_and_merge_matrix produces a new matrix which is equivalent to A but is +// sorted and has no duplicated entries: each (i, j) is unique. Values for +// duplicated entries are summed. +template +crsMat_t sort_and_merge_matrix(const crsMat_t& A); + +template +crsGraph_t sort_and_merge_graph(const crsGraph_t& G); + +template +void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, rowmap_t& rowmap_out, + entries_t& entries_out); + +namespace Impl { + +template +struct SortCrsMatrixFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using scalar_t = typename values_t::non_const_value_type; + using team_mem = typename Kokkos::TeamPolicy::member_type; + // The functor owns memory for entriesAux, so it can't have + // MemoryTraits + using entries_managed_t = Kokkos::View; + using values_managed_t = Kokkos::View; + + SortCrsMatrixFunctor(bool usingRangePol, const rowmap_t& rowmap_, + const entries_t& entries_, const values_t& values_) + : rowmap(rowmap_), entries(entries_), values(values_) { + if (usingRangePol) { + entriesAux = entries_managed_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), + entries.extent(0)); + valuesAux = values_managed_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values aux"), + values.extent(0)); + } + // otherwise, aux arrays won't be allocated (sorting in place) + } + + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + // Radix sort requires unsigned keys for comparison + using unsigned_lno_t = typename std::make_unsigned::type; + KokkosKernels::SerialRadixSort2( + (unsigned_lno_t*)entries.data() + rowStart, + (unsigned_lno_t*)entriesAux.data() + rowStart, values.data() + rowStart, + valuesAux.data() + rowStart, rowNum); + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const { + size_type i = t.league_rank(); + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + KokkosKernels::TeamBitonicSort2( + entries.data() + rowStart, values.data() + rowStart, rowNum, t); + } + + rowmap_t rowmap; + entries_t entries; + entries_managed_t entriesAux; + values_t values; + values_managed_t valuesAux; +}; + +template +struct SortCrsGraphFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using team_mem = typename Kokkos::TeamPolicy::member_type; + // The functor owns memory for entriesAux, so it can't have + // MemoryTraits + using entries_managed_t = Kokkos::View; + + SortCrsGraphFunctor(bool usingRangePol, const rowmap_t& rowmap_, + const entries_t& entries_) + : rowmap(rowmap_), entries(entries_) { + if (usingRangePol) { + entriesAux = entries_managed_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), + entries.extent(0)); + } + // otherwise, aux arrays won't be allocated (sorting in place) + } + + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + // Radix sort requires unsigned keys for comparison + using unsigned_lno_t = typename std::make_unsigned::type; + KokkosKernels::SerialRadixSort( + (unsigned_lno_t*)entries.data() + rowStart, + (unsigned_lno_t*)entriesAux.data() + rowStart, rowNum); + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const { + size_type i = t.league_rank(); + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + KokkosKernels::TeamBitonicSort( + entries.data() + rowStart, rowNum, t); + } + + rowmap_t rowmap; + entries_t entries; + entries_managed_t entriesAux; +}; + +template +struct MergedRowmapFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using c_rowmap_t = typename rowmap_t::const_type; + + // Precondition: entries are sorted within each row + MergedRowmapFunctor(const rowmap_t& mergedCounts_, const c_rowmap_t& rowmap_, + const entries_t& entries_) + : mergedCounts(mergedCounts_), rowmap(rowmap_), entries(entries_) {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row, size_type& lnewNNZ) const { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if (rowEnd == rowBegin) { + // Row was empty to begin with + mergedCounts(row) = 0; + return; + } + // Otherwise, the first entry in the row exists + lno_t uniqueEntries = 1; + for (size_type j = rowBegin + 1; j < rowEnd; j++) { + if (entries(j - 1) != entries(j)) uniqueEntries++; + } + mergedCounts(row) = uniqueEntries; + lnewNNZ += uniqueEntries; + if (row == lno_t((rowmap.extent(0) - 1) - 1)) mergedCounts(row + 1) = 0; + } + + rowmap_t mergedCounts; + c_rowmap_t rowmap; + entries_t entries; +}; + +template +struct MatrixMergedEntriesFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using scalar_t = typename values_t::non_const_value_type; + + // Precondition: entries are sorted within each row + MatrixMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_, + const values_t& values_, + const rowmap_t& mergedRowmap_, + const entries_t& mergedEntries_, + const values_t& mergedValues_) + : rowmap(rowmap_), + entries(entries_), + values(values_), + mergedRowmap(mergedRowmap_), + mergedEntries(mergedEntries_), + mergedValues(mergedValues_) {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if (rowEnd == rowBegin) { + // Row was empty to begin with, nothing to do + return; + } + // Otherwise, accumulate the value for each column + scalar_t accumVal = values(rowBegin); + lno_t accumCol = entries(rowBegin); + size_type insertPos = mergedRowmap(row); + for (size_type j = rowBegin + 1; j < rowEnd; j++) { + if (accumCol == entries(j)) { + // accumulate + accumVal += values(j); + } else { + // write out and reset + mergedValues(insertPos) = accumVal; + mergedEntries(insertPos) = accumCol; + insertPos++; + accumVal = values(j); + accumCol = entries(j); + } + } + // always left with the last unique entry + mergedValues(insertPos) = accumVal; + mergedEntries(insertPos) = accumCol; + } + + rowmap_t rowmap; + entries_t entries; + values_t values; + rowmap_t mergedRowmap; + entries_t mergedEntries; + values_t mergedValues; +}; + +template +struct GraphMergedEntriesFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + + // Precondition: entries are sorted within each row + GraphMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_, + const rowmap_t& mergedRowmap_, + const entries_t& mergedEntries_) + : rowmap(rowmap_), + entries(entries_), + mergedRowmap(mergedRowmap_), + mergedEntries(mergedEntries_) {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if (rowEnd == rowBegin) { + // Row was empty to begin with, nothing to do + return; + } + // Otherwise, accumulate the value for each column + lno_t accumCol = entries(rowBegin); + size_type insertPos = mergedRowmap(row); + for (size_type j = rowBegin + 1; j < rowEnd; j++) { + if (accumCol != entries(j)) { + // write out and reset + mergedEntries(insertPos) = accumCol; + insertPos++; + accumCol = entries(j); + } + } + // always left with the last unique entry + mergedEntries(insertPos) = accumCol; + } + + rowmap_t rowmap; + entries_t entries; + rowmap_t mergedRowmap; + entries_t mergedEntries; +}; + +template +KOKKOS_INLINE_FUNCTION void kk_swap(T& a, T& b) { + T t = a; + a = b; + b = t; +} + +template +struct sort_bsr_functor { + using lno_t = typename entries_type::non_const_value_type; + + row_map_type rowmap; + entries_type entries; + values_type values; + const lno_t blocksize; + + sort_bsr_functor(row_map_type rowmap_, entries_type entries_, + values_type values_, const lno_t blocksize_) + : rowmap(rowmap_), + entries(entries_), + values(values_), + blocksize(blocksize_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const lno_t i) const { + const lno_t rowStart = rowmap(i); + const lno_t rowSize = rowmap(i + 1) - rowStart; + auto* e = entries.data() + rowStart; + auto* v = values.data() + rowStart * blocksize; + bool done = false; + while (!done) { + done = true; + for (lno_t j = 1; j < rowSize; ++j) { + const lno_t jp = j - 1; + if (e[jp] <= e[j]) continue; + Impl::kk_swap(e[jp], e[j]); + auto const vb = v + j * blocksize; + auto const vbp = v + jp * blocksize; + for (lno_t k = 0; k < blocksize; + ++k) // std::swap_ranges(vb, vb + blocksize, vbp); + Impl::kk_swap(vb[k], vbp[k]); + done = false; + } + } + } +}; + +} // namespace Impl + +// Sort a CRS matrix: within each row, sort entries ascending by column. +// At the same time, permute the values. +template +void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, + const values_t& values) { + using lno_t = typename entries_t::non_const_value_type; + using team_pol = Kokkos::TeamPolicy; + bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if (numRows == 0) return; + Impl::SortCrsMatrixFunctor + funct(useRadix, rowmap, entries, values); + if (useRadix) { + Kokkos::parallel_for("sort_crs_matrix", + Kokkos::RangePolicy(0, numRows), + funct); + } else { + // Try to get teamsize to be largest power of 2 not greater than avg entries + // per row + // TODO (probably important for performnce): add thread-level sort also, and + // use that for small avg degree. But this works for now. + lno_t idealTeamSize = 1; + lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; + while (idealTeamSize < avgDeg / 2) { + idealTeamSize *= 2; + } + team_pol temp(numRows, 1); + lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); + lno_t teamSize = std::min(idealTeamSize, maxTeamSize); + Kokkos::parallel_for("sort_crs_matrix", team_pol(numRows, teamSize), funct); + } +} + +template +void sort_crs_matrix(const crsMat_t& A) { + // Note: rowmap_t has const values, but that's OK as sorting doesn't modify it + using rowmap_t = typename crsMat_t::row_map_type; + using entries_t = typename crsMat_t::index_type::non_const_type; + using values_t = typename crsMat_t::values_type::non_const_type; + using exec_space = typename crsMat_t::execution_space; + // NOTE: the rowmap of a StaticCrsGraph is const-valued, but the + // entries and CrsMatrix values are non-const (so sorting them directly + // is allowed) + sort_crs_matrix( + A.graph.row_map, A.graph.entries, A.values); +} + +// Sort a BRS matrix: within each row, sort entries ascending by column and +// permute the values accordingly. +template +void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, + const entries_t& entries, const values_t& values) { + // TODO: this is O(N^2) mock for debugging - do regular implementation based + // on Radix/Bitonic sort (like CSR) IDEA: maybe we need only one general + // Radix2/Bitonic2 and CSR sorting may call it with blockSize=1 ? + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if (numRows == 0) return; + const lno_t blocksize = blockdim * blockdim; + + assert(values.extent(0) == entries.extent(0) * blocksize); + Impl::sort_bsr_functor bsr_sorter( + rowmap, entries, values, blocksize); + Kokkos::parallel_for("sort_bsr_matrix", + Kokkos::RangePolicy(0, numRows), + bsr_sorter); +} + +// Sort a BSR matrix (like CRS but single values are replaced with contignous +// blocks) +template +void sort_bsr_matrix(const bsrMat_t& A) { + // NOTE: unlike rowmap, entries and values are non-const, so we can sort them + // directly + sort_bsr_matrix( + A.blockDim(), A.graph.row_map, A.graph.entries, A.values); +} + +// Sort a CRS graph: within each row, sort entries ascending by column. +template +void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { + using lno_t = typename entries_t::non_const_value_type; + using team_pol = Kokkos::TeamPolicy; + bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if (numRows == 0) return; + Impl::SortCrsGraphFunctor funct( + useRadix, rowmap, entries); + if (useRadix) { + Kokkos::parallel_for("sort_crs_graph", + Kokkos::RangePolicy(0, numRows), + funct); + } else { + // Try to get teamsize to be largest power of 2 less than or equal to + // half the entries per row. 0.5 * #entries is bitonic's parallelism within + // a row. + // TODO (probably important for performnce): add thread-level sort also, and + // use that for small avg degree. But this works for now. + lno_t idealTeamSize = 1; + lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; + while (idealTeamSize < avgDeg / 2) { + idealTeamSize *= 2; + } + team_pol temp(numRows, 1); + lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); + lno_t teamSize = std::min(idealTeamSize, maxTeamSize); + Kokkos::parallel_for("sort_crs_graph", team_pol(numRows, teamSize), funct); + } +} + +template +void sort_crs_graph(const crsGraph_t& G) { + static_assert( + !std::is_const::value, + "sort_crs_graph requires StaticCrsGraph entries to be non-const."); + sort_crs_graph(G.row_map, G.entries); +} + +// Sort the rows of matrix, and merge duplicate entries. +template +crsMat_t sort_and_merge_matrix(const crsMat_t& A) { + using c_rowmap_t = typename crsMat_t::row_map_type; + using rowmap_t = typename crsMat_t::row_map_type::non_const_type; + using entries_t = typename crsMat_t::index_type::non_const_type; + using values_t = typename crsMat_t::values_type::non_const_type; + using size_type = typename rowmap_t::non_const_value_type; + using exec_space = typename crsMat_t::execution_space; + using range_t = Kokkos::RangePolicy; + sort_crs_matrix(A); + // Count entries per row into a new rowmap, in terms of merges that can be + // done + rowmap_t mergedRowmap( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"), + A.numRows() + 1); + size_type numCompressedEntries = 0; + Kokkos::parallel_reduce(range_t(0, A.numRows()), + Impl::MergedRowmapFunctor( + mergedRowmap, A.graph.row_map, A.graph.entries), + numCompressedEntries); + // Prefix sum to get rowmap + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(A.numRows() + 1, + mergedRowmap); + entries_t mergedEntries("SortedMerged entries", numCompressedEntries); + values_t mergedValues("SortedMerged values", numCompressedEntries); + // Compute merged entries and values + Kokkos::parallel_for( + range_t(0, A.numRows()), + Impl::MatrixMergedEntriesFunctor( + A.graph.row_map, A.graph.entries, A.values, mergedRowmap, + mergedEntries, mergedValues)); + // Finally, construct the new compressed matrix + return crsMat_t("SortedMerged", A.numRows(), A.numCols(), + numCompressedEntries, mergedValues, mergedRowmap, + mergedEntries); +} + +template +void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, rowmap_t& rowmap_out, + entries_t& entries_out) { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using range_t = Kokkos::RangePolicy; + using const_rowmap_t = typename rowmap_t::const_type; + lno_t numRows = rowmap_in.extent(0); + if (numRows <= 1) { + // Matrix has zero rows + rowmap_out = rowmap_t(); + entries_out = entries_t(); + return; + } + numRows--; + // Sort in place + sort_crs_graph(rowmap_in, entries_in); + // Count entries per row into a new rowmap, in terms of merges that can be + // done + rowmap_out = rowmap_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"), + numRows + 1); + size_type numCompressedEntries = 0; + Kokkos::parallel_reduce(range_t(0, numRows), + Impl::MergedRowmapFunctor( + rowmap_out, rowmap_in, entries_in), + numCompressedEntries); + // Prefix sum to get rowmap + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(numRows + 1, + rowmap_out); + entries_out = entries_t("SortedMerged entries", numCompressedEntries); + // Compute merged entries and values + Kokkos::parallel_for( + range_t(0, numRows), + Impl::GraphMergedEntriesFunctor( + rowmap_in, entries_in, rowmap_out, entries_out)); +} + +template +crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { + using rowmap_t = typename crsGraph_t::row_map_type::non_const_type; + using entries_t = typename crsGraph_t::entries_type; + static_assert( + !std::is_const::value, + "sort_and_merge_graph requires StaticCrsGraph entries to be non-const."); + rowmap_t mergedRowmap; + entries_t mergedEntries; + sort_and_merge_graph(G.row_map, G.entries, mergedRowmap, + mergedEntries); + return crsGraph_t(mergedEntries, mergedRowmap); +} + +} // namespace KokkosSparse + +namespace KokkosKernels { + +// ---------------------------------- +// BSR matrix/graph sorting utilities +// ---------------------------------- + +// Sort a BRS matrix: within each row, sort entries ascending by column and +// permute the values accordingly. +template +[[deprecated]] +void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, + const entries_t& entries, const values_t& values) { + KokkosSparse::sort_bsr_matrix(blockdim, rowmap, entries, values); +} + +template +[[deprecated]] +void sort_bsr_matrix(const bsrMat_t& A) { + KokkosSparse::sort_bsr_matrix(A); +} + +// ---------------------------------- +// CRS matrix/graph sorting utilities +// ---------------------------------- + +// The sort_crs* functions sort the adjacent column list for each row into +// ascending order. + +template +[[deprecated]] +void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, + const values_t& values) { + KokkosSparse::sort_crs_matrix(rowmap, entries, values); +} + +template +[[deprecated]] +void sort_crs_matrix(const crsMat_t& A) { + KokkosSparse::sort_crs_matrix(A); +} + +template +[[deprecated]] +void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { + KokkosSparse::sort_crs_graph(rowmap, entries); +} + +template +[[deprecated]] +void sort_crs_graph(const crsGraph_t& G) { + KokkosSparse::sort_crs_graph(G); +} + +// sort_and_merge_matrix produces a new matrix which is equivalent to A but is +// sorted and has no duplicated entries: each (i, j) is unique. Values for +// duplicated entries are summed. +template +[[deprecated]] +crsMat_t sort_and_merge_matrix(const crsMat_t& A) { + KokkosSparse::sort_and_merge_matrix(A); +} + +template +[[deprecated]] +crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { + KokkosSparse::sort_and_merge_graph(G); +} + +template +[[deprecated]] +void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, rowmap_t& rowmap_out, + entries_t& entries_out) { + KokkosSparse::sort_and_merge_graph(rowmap_in, entries_in, rowmap_out, entries_out); +} + +// For backward compatibility: keep the public interface accessible in +// KokkosKernels::Impl:: +namespace Impl { +template +[[deprecated]] void sort_crs_graph(const rowmap_t& rowmap, + const entries_t& entries) { + KokkosKernels::sort_crs_graph(rowmap, + entries); +} + +template +[[deprecated]] void sort_crs_matrix(const rowmap_t& rowmap, + const entries_t& entries, + const values_t& values) { + KokkosKernels::sort_crs_matrix(rowmap, entries, values); +} + +template +[[deprecated]] void sort_crs_matrix(const crsMat_t& A) { + KokkosKernels::sort_crs_matrix(A); +} + +template +[[deprecated]] void sort_and_merge_graph( + const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, + rowmap_t& rowmap_out, entries_t& entries_out) { + KokkosKernels::sort_and_merge_graph( + rowmap_in, entries_in, rowmap_out, entries_out); +} + +template +[[deprecated]] crsMat_t sort_and_merge_matrix(const crsMat_t& A) { + return KokkosKernels::sort_and_merge_matrix(A); +} + +} // namespace Impl +} // namespace KokkosKernels + +#endif // _KOKKOSSPARSE_SORTCRS_HPP diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/sparse/KokkosSparse_Utils.hpp similarity index 100% rename from src/common/KokkosKernels_SparseUtils.hpp rename to src/sparse/KokkosSparse_Utils.hpp diff --git a/src/common/KokkosKernels_SparseUtils_cusparse.hpp b/src/sparse/KokkosSparse_Utils_cusparse.hpp similarity index 100% rename from src/common/KokkosKernels_SparseUtils_cusparse.hpp rename to src/sparse/KokkosSparse_Utils_cusparse.hpp diff --git a/src/common/KokkosKernels_SparseUtils_mkl.hpp b/src/sparse/KokkosSparse_Utils_mkl.hpp similarity index 100% rename from src/common/KokkosKernels_SparseUtils_mkl.hpp rename to src/sparse/KokkosSparse_Utils_mkl.hpp diff --git a/src/common/KokkosKernels_SparseUtils_rocsparse.hpp b/src/sparse/KokkosSparse_Utils_rocsparse.hpp similarity index 100% rename from src/common/KokkosKernels_SparseUtils_rocsparse.hpp rename to src/sparse/KokkosSparse_Utils_rocsparse.hpp diff --git a/src/sparse/KokkosSparse_sptrsv_cholmod.hpp b/src/sparse/KokkosSparse_sptrsv_cholmod.hpp index 796ee579bd..6d354047cf 100644 --- a/src/sparse/KokkosSparse_sptrsv_cholmod.hpp +++ b/src/sparse/KokkosSparse_sptrsv_cholmod.hpp @@ -56,7 +56,7 @@ defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) #include "cholmod.h" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosSparse_sptrsv_supernode.hpp" namespace KokkosSparse { diff --git a/src/sparse/KokkosSparse_sptrsv_supernode.hpp b/src/sparse/KokkosSparse_sptrsv_supernode.hpp index fa9a607be7..1c86121bde 100644 --- a/src/sparse/KokkosSparse_sptrsv_supernode.hpp +++ b/src/sparse/KokkosSparse_sptrsv_supernode.hpp @@ -63,7 +63,7 @@ #include "KokkosBatched_Trmm_Decl.hpp" #include "KokkosBatched_Trmm_Serial_Impl.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_SortCrs.hpp" #include "KokkosSparse_sptrsv.hpp" namespace KokkosSparse { @@ -597,7 +597,7 @@ host_graph_t generate_supernodal_graph(bool col_major, graph_t &graph, #endif // sort column ids per row - KokkosKernels::sort_crs_graph(hr, hc); #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE time_seconds = timer.seconds(); diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 0f265dfbc4..62b86ca72e 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -52,7 +52,7 @@ #include "KokkosKernels_Uniform_Initialized_MemoryPool.hpp" #include "KokkosKernels_BitUtils.hpp" #include "KokkosKernels_SimpleUtils.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_SortCrs.hpp" // FOR DEBUGGING #include "KokkosBlas1_nrm2.hpp" @@ -979,8 +979,8 @@ class PointGaussSeidel { gsHandle->set_long_row_x(long_row_x); } else { // Just sort rows by ID. - KokkosKernels::sort_crs_graph(color_xadj, color_adj); + KokkosSparse::sort_crs_graph(color_xadj, color_adj); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE MyExecSpace().fence(); diff --git a/src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp index 2131cec751..c4ae435f55 100644 --- a/src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -46,7 +46,7 @@ #define _KOKKOS_SPADD_SYMBOLIC_IMPL_HPP #include "KokkosKernels_Handle.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_SortCrs.hpp" #include "Kokkos_ArithTraits.hpp" namespace KokkosSparse { @@ -593,8 +593,8 @@ void spadd_symbolic_impl( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::UnmergedSum", range_type(0, nrows), unmergedSum); // sort the unmerged sum - KokkosKernels::sort_crs_matrix( + KokkosSparse::sort_crs_matrix( c_rowmap_upperbound, c_entries_uncompressed, ab_perm); ordinal_view_t a_pos( Kokkos::view_alloc(Kokkos::WithoutInitializing, "A entry positions"), diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp index 9b4c28c877..dadc944b09 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl.hpp @@ -47,7 +47,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index d1bfb3db5c..9a6ab70f9e 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -46,7 +46,7 @@ #define _KOKKOSSPGEMMMKL_HPP #include "KokkosKernels_config.h" -#include "KokkosKernels_SparseUtils_mkl.hpp" +#include "KokkosSparse_Utils_mkl.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include "mkl_spblas.h" diff --git a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index 19bc5ec163..6adafd6319 100644 --- a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -57,7 +57,7 @@ // needed for classical GS #include "KokkosSparse_sptrsv.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosSparse_gauss_seidel_handle.hpp" @@ -854,10 +854,10 @@ class TwostageGaussSeidel { // values // CuSparse needs matrix sorted by column indexes for each row // TODO: may need to move this to symbolic/numeric of sptrsv - KokkosKernels::sort_crs_matrix( + KokkosSparse::sort_crs_matrix( rowmap_viewL, column_viewL, values_viewL); - KokkosKernels::sort_crs_matrix( rowmap_viewU, column_viewU, values_viewU); diff --git a/unit_test/common/Test_Common.hpp b/unit_test/common/Test_Common.hpp index 9d6958e816..cc4204d076 100644 --- a/unit_test/common/Test_Common.hpp +++ b/unit_test/common/Test_Common.hpp @@ -8,7 +8,6 @@ // #include #include #include -#include #include #include #include diff --git a/unit_test/common/Test_Common_Sorting.hpp b/unit_test/common/Test_Common_Sorting.hpp index 1580a0c98b..f0320cb637 100644 --- a/unit_test/common/Test_Common_Sorting.hpp +++ b/unit_test/common/Test_Common_Sorting.hpp @@ -525,226 +525,6 @@ void testBitonicSortLexicographic() { ASSERT_TRUE(ordered); } -template -void testSortCRS(default_lno_t numRows, default_lno_t numCols, - default_size_type nnz, bool doValues, bool doStructInterface) { - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; - using crsMat_t = - KokkosSparse::CrsMatrix; - using rowmap_t = typename crsMat_t::row_map_type; - using entries_t = typename crsMat_t::index_type; - using values_t = typename crsMat_t::values_type; - // Create a random matrix on device - // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this - // wouldn't test anything - crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix( - numRows, numCols, nnz, 2, numCols / 2); - auto rowmap = A.graph.row_map; - auto entries = A.graph.entries; - auto values = A.values; - Kokkos::View rowmapHost("rowmap host", - numRows + 1); - Kokkos::View entriesHost("sorted entries host", - nnz); - Kokkos::View valuesHost("sorted values host", - nnz); - Kokkos::deep_copy(rowmapHost, rowmap); - Kokkos::deep_copy(entriesHost, entries); - Kokkos::deep_copy(valuesHost, values); - struct ColValue { - ColValue() {} - ColValue(lno_t c, scalar_t v) : col(c), val(v) {} - bool operator<(const ColValue& rhs) const { return col < rhs.col; } - bool operator==(const ColValue& rhs) const { - return col == rhs.col && val == rhs.val; - } - lno_t col; - scalar_t val; - }; - // sort one row at a time on host using STL. - { - for (lno_t i = 0; i < numRows; i++) { - std::vector rowCopy; - for (size_type j = rowmapHost(i); j < rowmapHost(i + 1); j++) - rowCopy.emplace_back(entriesHost(j), valuesHost(j)); - std::sort(rowCopy.begin(), rowCopy.end()); - // write sorted row back - for (size_t j = 0; j < rowCopy.size(); j++) { - entriesHost(rowmapHost(i) + j) = rowCopy[j].col; - valuesHost(rowmapHost(i) + j) = rowCopy[j].val; - } - } - } - // call the actual sort routine being tested - if (doValues) { - if (doStructInterface) { - KokkosKernels::sort_crs_matrix(A); - } else { - KokkosKernels::sort_crs_matrix( - A.graph.row_map, A.graph.entries, A.values); - } - } else { - if (doStructInterface) { - KokkosKernels::sort_crs_graph(A.graph); - } else { - KokkosKernels::sort_crs_graph( - A.graph.row_map, A.graph.entries); - } - } - // Copy to host and compare - Kokkos::View entriesOut("sorted entries host", - nnz); - Kokkos::View valuesOut("sorted values host", - nnz); - Kokkos::deep_copy(entriesOut, entries); - Kokkos::deep_copy(valuesOut, values); - for (size_type i = 0; i < nnz; i++) { - EXPECT_EQ(entriesHost(i), entriesOut(i)) - << "Sorted column indices are wrong!"; - if (doValues) { - EXPECT_EQ(valuesHost(i), valuesOut(i)) << "Sorted values are wrong!"; - } - } -} - -template -void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { - // This test is about bug #960. - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; - using crsMat_t = - KokkosSparse::CrsMatrix, - size_type>; - using crsMat_Managed_t = - KokkosSparse::CrsMatrix; - using rowmap_t = typename crsMat_t::row_map_type; - using entries_t = typename crsMat_t::index_type; - using values_t = typename crsMat_t::values_type; - const lno_t numRows = 50; - const lno_t numCols = numRows; - size_type nnz = numRows * 5; - // Create a random matrix on device - // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this - // wouldn't test anything - crsMat_Managed_t A_managed = - KokkosKernels::Impl::kk_generate_sparse_matrix( - numRows, numCols, nnz, 2, numCols / 2); - crsMat_t A(A_managed); - auto rowmap = A.graph.row_map; - auto entries = A.graph.entries; - auto values = A.values; - if (doValues) { - if (doStructInterface) { - KokkosKernels::sort_crs_matrix(A); - } else { - KokkosKernels::sort_crs_matrix( - A.graph.row_map, A.graph.entries, A.values); - } - } else { - if (doStructInterface) { - KokkosKernels::sort_crs_graph(A.graph); - } else { - KokkosKernels::sort_crs_graph( - A.graph.row_map, A.graph.entries); - } - } -} - -template -void testSortAndMerge() { - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; - using crsMat_t = - KokkosSparse::CrsMatrix; - using rowmap_t = typename crsMat_t::row_map_type::non_const_type; - using entries_t = typename crsMat_t::index_type; - using values_t = typename crsMat_t::values_type; - using Kokkos::HostSpace; - using Kokkos::MemoryTraits; - using Kokkos::Unmanaged; - // Create a small CRS matrix on host - std::vector inRowmap = {0, 4, 4, 5, 7, 10}; - std::vector inEntries = { - 4, 3, 5, 3, // row 0 - // row 1 has no entries - 6, // row 2 - 2, 2, // row 3 - 0, 1, 2 // row 4 - }; - // note: choosing values that can be represented exactly by float - std::vector inValues = { - 1.5, 4, 1, -3, // row 0 - // row 1 - 2, // row 2 - -1, -2, // row 3 - 0, 3.5, -2.25 // row 4 - }; - lno_t nrows = 5; - lno_t ncols = 7; - size_type nnz = inEntries.size(); - Kokkos::View> hostInRowmap( - inRowmap.data(), nrows + 1); - Kokkos::View> hostInEntries( - inEntries.data(), nnz); - Kokkos::View> hostInValues( - inValues.data(), nnz); - rowmap_t devInRowmap("", nrows + 1); - entries_t devInEntries("", nnz); - values_t devInValues("", nnz); - Kokkos::deep_copy(devInRowmap, hostInRowmap); - Kokkos::deep_copy(devInEntries, hostInEntries); - Kokkos::deep_copy(devInValues, hostInValues); - crsMat_t input("Input", nrows, ncols, nnz, devInValues, devInRowmap, - devInEntries); - crsMat_t output = KokkosKernels::sort_and_merge_matrix(input); - exec_space().fence(); - EXPECT_EQ(output.numRows(), nrows); - EXPECT_EQ(output.numCols(), ncols); - auto outRowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - output.graph.row_map); - auto outEntries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - output.graph.entries); - auto outValues = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), output.values); - // Expect 2 merges to have taken place - std::vector goldRowmap = {0, 3, 3, 4, 5, 8}; - std::vector goldEntries = { - 3, 4, 5, // row 0 - // row 1 has no entries - 6, // row 2 - 2, // row 3 - 0, 1, 2 // row 4 - }; - // note: choosing values that can be represented exactly by float - std::vector goldValues = { - 1, 1.5, 1, // row 0 - // row 1 - 2, // row 2 - -3, // row 3 - 0, 3.5, -2.25 // row 4 - }; - EXPECT_EQ(goldRowmap.size(), outRowmap.extent(0)); - EXPECT_EQ(goldEntries.size(), outEntries.extent(0)); - EXPECT_EQ(goldValues.size(), outValues.extent(0)); - EXPECT_EQ(goldValues.size(), output.nnz()); - for (lno_t i = 0; i < nrows + 1; i++) EXPECT_EQ(goldRowmap[i], outRowmap(i)); - for (size_type i = 0; i < output.nnz(); i++) { - EXPECT_EQ(goldEntries[i], outEntries(i)); - EXPECT_EQ(goldValues[i], outValues(i)); - } -} - TEST_F(TestCategory, common_serial_radix) { // Test serial radix over some contiguous small arrays // 1st arg is #arrays, 2nd arg is max subarray size @@ -805,31 +585,4 @@ TEST_F(TestCategory, common_device_bitonic) { testBitonicSortLexicographic(); } -TEST_F(TestCategory, common_sort_crsgraph) { - for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { - testSortCRS(10, 10, 20, false, doStructInterface); - testSortCRS(100, 100, 2000, false, doStructInterface); - testSortCRS(1000, 1000, 30000, false, doStructInterface); - testSortCRSUnmanaged(false, doStructInterface); - } -} - -TEST_F(TestCategory, common_sort_crsmatrix) { - for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { - testSortCRS(10, 10, 20, true, doStructInterface); - testSortCRS(100, 100, 2000, true, doStructInterface); - testSortCRS(1000, 1000, 30000, true, doStructInterface); - testSortCRSUnmanaged(true, doStructInterface); - } -} - -TEST_F(TestCategory, common_sort_crs_longrows) { - testSortCRS(1, 50000, 10000, false, false); - testSortCRS(1, 50000, 10000, true, false); -} - -TEST_F(TestCategory, common_sort_merge_crsmatrix) { - testSortAndMerge(); -} - #endif diff --git a/unit_test/graph/Test_Graph_graph_color.hpp b/unit_test/graph/Test_Graph_graph_color.hpp index ef7c14a931..da86546862 100644 --- a/unit_test/graph/Test_Graph_graph_color.hpp +++ b/unit_test/graph/Test_Graph_graph_color.hpp @@ -47,8 +47,8 @@ #include "KokkosGraph_Distance1Color.hpp" #include "KokkosSparse_CrsMatrix.hpp" -#include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosKernels_Handle.hpp" using namespace KokkosKernels; @@ -115,7 +115,7 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, // typedef typename lno_view_t::non_const_value_type size_type; lno_t numCols = numRows; - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, row_size_variance, bandwidth); typename lno_view_t::non_const_type sym_xadj; diff --git a/unit_test/graph/Test_Graph_graph_color_deterministic.hpp b/unit_test/graph/Test_Graph_graph_color_deterministic.hpp index ec718e9aa4..2fd64675ec 100644 --- a/unit_test/graph/Test_Graph_graph_color_deterministic.hpp +++ b/unit_test/graph/Test_Graph_graph_color_deterministic.hpp @@ -48,7 +48,7 @@ #include "KokkosGraph_Distance1Color.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosKernels_Handle.hpp" using namespace KokkosKernels; diff --git a/unit_test/graph/Test_Graph_graph_color_distance2.hpp b/unit_test/graph/Test_Graph_graph_color_distance2.hpp index 70158941a8..45444cd136 100644 --- a/unit_test/graph/Test_Graph_graph_color_distance2.hpp +++ b/unit_test/graph/Test_Graph_graph_color_distance2.hpp @@ -49,8 +49,8 @@ #include "KokkosGraph_Distance2Color.hpp" #include "KokkosGraph_MIS2.hpp" #include "KokkosSparse_CrsMatrix.hpp" -#include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" @@ -159,7 +159,7 @@ void test_dist2_coloring(lno_t numVerts, size_type nnz, lno_t bandwidth, KokkosKernelsHandle; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph @@ -216,7 +216,7 @@ void test_bipartite_symmetric(lno_t numVerts, size_type nnz, lno_t bandwidth, KokkosKernelsHandle; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph @@ -273,7 +273,7 @@ void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, KokkosKernelsHandle; // Generate graph - crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, row_size_variance, bandwidth); auto G = A.graph; rowmap_t t_rowmap("rowmap^T", numCols + 1); diff --git a/unit_test/graph/Test_Graph_mis2.hpp b/unit_test/graph/Test_Graph_mis2.hpp index ed3acc3b85..c1b5e179fe 100644 --- a/unit_test/graph/Test_Graph_mis2.hpp +++ b/unit_test/graph/Test_Graph_mis2.hpp @@ -50,7 +50,8 @@ #include "KokkosGraph_ExplicitCoarsening.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" @@ -122,7 +123,7 @@ void test_mis2(lno_t numVerts, size_type nnz, lno_t bandwidth, using rowmap_t = typename c_rowmap_t::non_const_type; using entries_t = typename c_entries_t::non_const_type; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph @@ -164,7 +165,7 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, using entries_t = typename c_entries_t::non_const_type; using labels_t = entries_t; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph diff --git a/unit_test/sparse/Test_Sparse.hpp b/unit_test/sparse/Test_Sparse.hpp index 65cbb40ca5..e75eb1ce6a 100644 --- a/unit_test/sparse/Test_Sparse.hpp +++ b/unit_test/sparse/Test_Sparse.hpp @@ -13,12 +13,14 @@ #include "Test_Sparse_spgemm_jacobi.hpp" #include "Test_Sparse_spgemm.hpp" #include "Test_Sparse_bspgemm.hpp" +#include "Test_Sparse_SortCrs.hpp" #include "Test_Sparse_spiluk.hpp" #include "Test_Sparse_spmv.hpp" #include "Test_Sparse_spmv_blockcrs.hpp" #include "Test_Sparse_spmv_bsr.hpp" #include "Test_Sparse_sptrsv.hpp" #include "Test_Sparse_trsv.hpp" +#include "Test_Sparse_Transpose.hpp" #include "Test_Sparse_TestUtils_RandCscMat.hpp" #include "Test_Sparse_csc2csr.hpp" diff --git a/unit_test/sparse/Test_Sparse_SortCrs.hpp b/unit_test/sparse/Test_Sparse_SortCrs.hpp new file mode 100644 index 0000000000..edae86304c --- /dev/null +++ b/unit_test/sparse/Test_Sparse_SortCrs.hpp @@ -0,0 +1,311 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Test_Sparse_SortCrs.hpp +/// \brief Tests for sort_crs_matrix and sort_crs_graph in KokkosSparse_SortCrs.hpp + +#ifndef KOKKOSSPARSE_SORTCRSTEST_HPP +#define KOKKOSSPARSE_SORTCRSTEST_HPP + +#include +#include +#include +#include "KokkosSparse_IOUtils.hpp" +#include +#include +#include +#include +#include +#include + + + +template +void testSortCRS(default_lno_t numRows, default_lno_t numCols, + default_size_type nnz, bool doValues, bool doStructInterface) { + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using mem_space = typename exec_space::memory_space; + using device_t = Kokkos::Device; + using crsMat_t = + KokkosSparse::CrsMatrix; + using rowmap_t = typename crsMat_t::row_map_type; + using entries_t = typename crsMat_t::index_type; + using values_t = typename crsMat_t::values_type; + // Create a random matrix on device + // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this + // wouldn't test anything + crsMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix( + numRows, numCols, nnz, 2, numCols / 2); + auto rowmap = A.graph.row_map; + auto entries = A.graph.entries; + auto values = A.values; + Kokkos::View rowmapHost("rowmap host", + numRows + 1); + Kokkos::View entriesHost("sorted entries host", + nnz); + Kokkos::View valuesHost("sorted values host", + nnz); + Kokkos::deep_copy(rowmapHost, rowmap); + Kokkos::deep_copy(entriesHost, entries); + Kokkos::deep_copy(valuesHost, values); + struct ColValue { + ColValue() {} + ColValue(lno_t c, scalar_t v) : col(c), val(v) {} + bool operator<(const ColValue& rhs) const { return col < rhs.col; } + bool operator==(const ColValue& rhs) const { + return col == rhs.col && val == rhs.val; + } + lno_t col; + scalar_t val; + }; + // sort one row at a time on host using STL. + { + for (lno_t i = 0; i < numRows; i++) { + std::vector rowCopy; + for (size_type j = rowmapHost(i); j < rowmapHost(i + 1); j++) + rowCopy.emplace_back(entriesHost(j), valuesHost(j)); + std::sort(rowCopy.begin(), rowCopy.end()); + // write sorted row back + for (size_t j = 0; j < rowCopy.size(); j++) { + entriesHost(rowmapHost(i) + j) = rowCopy[j].col; + valuesHost(rowmapHost(i) + j) = rowCopy[j].val; + } + } + } + // call the actual sort routine being tested + if (doValues) { + if (doStructInterface) { + KokkosSparse::sort_crs_matrix(A); + } else { + KokkosSparse::sort_crs_matrix( + A.graph.row_map, A.graph.entries, A.values); + } + } else { + if (doStructInterface) { + KokkosSparse::sort_crs_graph(A.graph); + } else { + KokkosSparse::sort_crs_graph( + A.graph.row_map, A.graph.entries); + } + } + // Copy to host and compare + Kokkos::View entriesOut("sorted entries host", + nnz); + Kokkos::View valuesOut("sorted values host", + nnz); + Kokkos::deep_copy(entriesOut, entries); + Kokkos::deep_copy(valuesOut, values); + for (size_type i = 0; i < nnz; i++) { + EXPECT_EQ(entriesHost(i), entriesOut(i)) + << "Sorted column indices are wrong!"; + if (doValues) { + EXPECT_EQ(valuesHost(i), valuesOut(i)) << "Sorted values are wrong!"; + } + } +} + +template +void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { + // This test is about bug #960. + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using mem_space = typename exec_space::memory_space; + using device_t = Kokkos::Device; + using crsMat_t = + KokkosSparse::CrsMatrix, + size_type>; + using crsMat_Managed_t = + KokkosSparse::CrsMatrix; + using rowmap_t = typename crsMat_t::row_map_type; + using entries_t = typename crsMat_t::index_type; + using values_t = typename crsMat_t::values_type; + const lno_t numRows = 50; + const lno_t numCols = numRows; + size_type nnz = numRows * 5; + // Create a random matrix on device + // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this + // wouldn't test anything + crsMat_Managed_t A_managed = + KokkosSparse::Impl::kk_generate_sparse_matrix( + numRows, numCols, nnz, 2, numCols / 2); + crsMat_t A(A_managed); + auto rowmap = A.graph.row_map; + auto entries = A.graph.entries; + auto values = A.values; + if (doValues) { + if (doStructInterface) { + KokkosSparse::sort_crs_matrix(A); + } else { + KokkosSparse::sort_crs_matrix( + A.graph.row_map, A.graph.entries, A.values); + } + } else { + if (doStructInterface) { + KokkosSparse::sort_crs_graph(A.graph); + } else { + KokkosSparse::sort_crs_graph( + A.graph.row_map, A.graph.entries); + } + } +} + +template +void testSortAndMerge() { + using size_type = default_size_type; + using lno_t = default_lno_t; + using scalar_t = default_scalar; + using mem_space = typename exec_space::memory_space; + using device_t = Kokkos::Device; + using crsMat_t = + KokkosSparse::CrsMatrix; + using rowmap_t = typename crsMat_t::row_map_type::non_const_type; + using entries_t = typename crsMat_t::index_type; + using values_t = typename crsMat_t::values_type; + using Kokkos::HostSpace; + using Kokkos::MemoryTraits; + using Kokkos::Unmanaged; + // Create a small CRS matrix on host + std::vector inRowmap = {0, 4, 4, 5, 7, 10}; + std::vector inEntries = { + 4, 3, 5, 3, // row 0 + // row 1 has no entries + 6, // row 2 + 2, 2, // row 3 + 0, 1, 2 // row 4 + }; + // note: choosing values that can be represented exactly by float + std::vector inValues = { + 1.5, 4, 1, -3, // row 0 + // row 1 + 2, // row 2 + -1, -2, // row 3 + 0, 3.5, -2.25 // row 4 + }; + lno_t nrows = 5; + lno_t ncols = 7; + size_type nnz = inEntries.size(); + Kokkos::View> hostInRowmap( + inRowmap.data(), nrows + 1); + Kokkos::View> hostInEntries( + inEntries.data(), nnz); + Kokkos::View> hostInValues( + inValues.data(), nnz); + rowmap_t devInRowmap("", nrows + 1); + entries_t devInEntries("", nnz); + values_t devInValues("", nnz); + Kokkos::deep_copy(devInRowmap, hostInRowmap); + Kokkos::deep_copy(devInEntries, hostInEntries); + Kokkos::deep_copy(devInValues, hostInValues); + crsMat_t input("Input", nrows, ncols, nnz, devInValues, devInRowmap, + devInEntries); + crsMat_t output = KokkosSparse::sort_and_merge_matrix(input); + exec_space().fence(); + EXPECT_EQ(output.numRows(), nrows); + EXPECT_EQ(output.numCols(), ncols); + auto outRowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + output.graph.row_map); + auto outEntries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + output.graph.entries); + auto outValues = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), output.values); + // Expect 2 merges to have taken place + std::vector goldRowmap = {0, 3, 3, 4, 5, 8}; + std::vector goldEntries = { + 3, 4, 5, // row 0 + // row 1 has no entries + 6, // row 2 + 2, // row 3 + 0, 1, 2 // row 4 + }; + // note: choosing values that can be represented exactly by float + std::vector goldValues = { + 1, 1.5, 1, // row 0 + // row 1 + 2, // row 2 + -3, // row 3 + 0, 3.5, -2.25 // row 4 + }; + EXPECT_EQ(goldRowmap.size(), outRowmap.extent(0)); + EXPECT_EQ(goldEntries.size(), outEntries.extent(0)); + EXPECT_EQ(goldValues.size(), outValues.extent(0)); + EXPECT_EQ(goldValues.size(), output.nnz()); + for (lno_t i = 0; i < nrows + 1; i++) EXPECT_EQ(goldRowmap[i], outRowmap(i)); + for (size_type i = 0; i < output.nnz(); i++) { + EXPECT_EQ(goldEntries[i], outEntries(i)); + EXPECT_EQ(goldValues[i], outValues(i)); + } +} + +TEST_F(TestCategory, common_sort_crsgraph) { + for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { + testSortCRS(10, 10, 20, false, doStructInterface); + testSortCRS(100, 100, 2000, false, doStructInterface); + testSortCRS(1000, 1000, 30000, false, doStructInterface); + testSortCRSUnmanaged(false, doStructInterface); + } +} + +TEST_F(TestCategory, common_sort_crsmatrix) { + for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { + testSortCRS(10, 10, 20, true, doStructInterface); + testSortCRS(100, 100, 2000, true, doStructInterface); + testSortCRS(1000, 1000, 30000, true, doStructInterface); + testSortCRSUnmanaged(true, doStructInterface); + } +} + +TEST_F(TestCategory, common_sort_crs_longrows) { + testSortCRS(1, 50000, 10000, false, false); + testSortCRS(1, 50000, 10000, true, false); +} + +TEST_F(TestCategory, common_sort_merge_crsmatrix) { + testSortAndMerge(); +} + +#endif // KOKKOSSPARSE_SORTCRSTEST_HPP diff --git a/unit_test/common/Test_Common_Transpose.hpp b/unit_test/sparse/Test_Sparse_Transpose.hpp similarity index 95% rename from unit_test/common/Test_Common_Transpose.hpp rename to unit_test/sparse/Test_Sparse_Transpose.hpp index fba29da81d..7431d0c485 100644 --- a/unit_test/common/Test_Common_Transpose.hpp +++ b/unit_test/sparse/Test_Sparse_Transpose.hpp @@ -49,11 +49,12 @@ #include #include -#include -#include +#include #include +#include #include #include +#include template struct ExactCompare { @@ -85,7 +86,7 @@ void testTranspose(int numRows, int numCols, bool doValues) { using values_t = typename crsMat_t::values_type::non_const_type; size_type nnz = 10 * numRows; // Generate a matrix that has 0 entries in some rows - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, 3 * 10, numRows / 2); // compute the transpose while unsorted, then transpose again rowmap_t t_rowmap("Rowmap^T", numCols + 1); // this view is initialized to 0 @@ -124,8 +125,8 @@ void testTranspose(int numRows, int numCols, bool doValues) { } // Sort both the transpose-transpose, and the original matrix (to compare // directly) - KokkosKernels::sort_crs_matrix(input_mat); - KokkosKernels::sort_crs_matrix( + KokkosSparse::sort_crs_matrix(input_mat); + KokkosSparse::sort_crs_matrix( tt_rowmap, tt_entries, tt_values); // The views should now be exactly identical, since they represent the same // matrix and are sorted diff --git a/unit_test/sparse/Test_Sparse_Utils_cusparse.hpp b/unit_test/sparse/Test_Sparse_Utils_cusparse.hpp index 3d85ec394a..0ad16c54d0 100644 --- a/unit_test/sparse/Test_Sparse_Utils_cusparse.hpp +++ b/unit_test/sparse/Test_Sparse_Utils_cusparse.hpp @@ -7,7 +7,7 @@ #include #include -#include "KokkosKernels_SparseUtils_cusparse.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" void test_cusparse_safe_call() { bool caught_exception = false; diff --git a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp index cd90ec39ea..0f4c9b0d67 100644 --- a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp @@ -48,7 +48,8 @@ #include "KokkosKernels_TestUtils.hpp" #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include #include #include @@ -200,7 +201,7 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t block_size = params.block_size; crsMat_t crsmat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); lno_view_t pf_rm; @@ -288,7 +289,7 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t block_size = params.block_size; crsMat_t crsmat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); lno_view_t pf_rm; diff --git a/unit_test/sparse/Test_Sparse_bspgemm.hpp b/unit_test/sparse/Test_Sparse_bspgemm.hpp index a3ec84fedf..7374ac6a78 100644 --- a/unit_test/sparse/Test_Sparse_bspgemm.hpp +++ b/unit_test/sparse/Test_Sparse_bspgemm.hpp @@ -45,10 +45,11 @@ #include #include -#include "KokkosKernels_SparseUtils.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_SortCrs.hpp" #include "KokkosSparse_spgemm.hpp" #include "KokkosSparse_BsrMatrix.hpp" +#include "KokkosSparse_IOUtils.hpp" using namespace KokkosSparse; @@ -120,8 +121,8 @@ bool is_same_block_matrix(bsrMat_t output_mat_actual, return false; } - KokkosKernels::sort_bsr_matrix(output_mat_actual); - KokkosKernels::sort_bsr_matrix(output_mat_reference); + KokkosSparse::sort_bsr_matrix(output_mat_actual); + KokkosSparse::sort_bsr_matrix(output_mat_reference); bool is_identical = true; is_identical = KokkosKernels::Impl::kk_is_identical_view< @@ -187,9 +188,9 @@ void test_bspgemm(lno_t blkDim, lno_t m, lno_t k, lno_t n, size_type nnz, // Generate random compressed sparse row matrix. Randomly generated (non-zero) // values are stored in a 1-D (1 rank) array. - bsrMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix( + bsrMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix( blkDim, m, k, nnz, row_size_variance, bandwidth); - bsrMat_t B = KokkosKernels::Impl::kk_generate_sparse_matrix( + bsrMat_t B = KokkosSparse::Impl::kk_generate_sparse_matrix( blkDim, k, n, nnz, row_size_variance, bandwidth); const bool is_empy_case = m < 1 || n < 1 || k < 1 || nnz < 1; diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp index 6e9661ea62..627a9fc99e 100644 --- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp @@ -47,6 +47,7 @@ #include #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" //#include #include #include @@ -61,7 +62,7 @@ #include "KokkosSparse_gauss_seidel.hpp" #include "KokkosSparse_partitioning_impl.hpp" #include "KokkosSparse_sor_sequential_impl.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_SortCrs.hpp" #include "KokkosKernels_TestUtils.hpp" // #ifndef kokkos_complex_double @@ -183,7 +184,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, srand(245); lno_t numCols = numRows; crsMat_t input_mat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); if (symmetric) { // Symmetrize on host, rather than relying on the parallel versions (those @@ -272,7 +273,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t numCols = numRows; crsMat_t input_mat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); if (symmetric) { // Symmetrize on host, rather than relying on the parallel versions (those @@ -396,7 +397,7 @@ void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, crsMat_t; lno_t numCols = numRows; crsMat_t input_mat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); auto rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), input_mat.graph.row_map); @@ -472,7 +473,7 @@ void test_balloon_clustering(lno_t numRows, size_type nnzPerRow, srand(245); size_type nnzTotal = nnzPerRow * numRows; lno_t nnzVariance = nnzPerRow / 4; - crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numRows, nnzTotal, nnzVariance, bandwidth); lno_row_view_t symRowmap; lno_nnz_view_t symEntries; @@ -609,7 +610,7 @@ void test_gauss_seidel_long_rows(lno_t numRows, lno_t numLongRows, rowmap.data(), numRows + 1)); crsMat_t input_mat("A", numRows, numRows, totalEntries, valuesView, rowmapView, entriesView); - input_mat = KokkosKernels::sort_and_merge_matrix(input_mat); + input_mat = KokkosSparse::sort_and_merge_matrix(input_mat); if (symmetric) { // Symmetrize on host, rather than relying on the parallel versions (those // can be tested for symmetric=false) @@ -660,11 +661,11 @@ void test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) { const scalar_t one = Kokkos::ArithTraits::one(); size_type nnz = nnzPerRow * numRows; crsMat_t input_mat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numRows, nnz, 0, numRows / 10, 2.0 * one); input_mat = Test::symmetrize(input_mat); - input_mat = KokkosKernels::sort_and_merge_matrix(input_mat); + input_mat = KokkosSparse::sort_and_merge_matrix(input_mat); scalar_view_t solution_x( Kokkos::view_alloc(Kokkos::WithoutInitializing, "X (correct)"), numRows); create_random_x_vector(solution_x); diff --git a/unit_test/sparse/Test_Sparse_rocsparse.hpp b/unit_test/sparse/Test_Sparse_rocsparse.hpp index 27e0b1f9fd..fe1bf8e9b2 100644 --- a/unit_test/sparse/Test_Sparse_rocsparse.hpp +++ b/unit_test/sparse/Test_Sparse_rocsparse.hpp @@ -7,7 +7,7 @@ #include #include #include -#include "KokkosKernels_SparseUtils_rocsparse.hpp" +#include "KokkosSparse_Utils_rocsparse.hpp" void test_rocsparse_version() { // Print version diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp index a7b9432857..a1e33c0ca6 100644 --- a/unit_test/sparse/Test_Sparse_spgemm.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm.hpp @@ -45,8 +45,8 @@ #include #include -#include "KokkosKernels_SparseUtils.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_SortCrs.hpp" #include #include #include @@ -58,6 +58,7 @@ #include #include +#include // This file contains the matrix for test_issue402 #include "matrixIssue402.hpp" @@ -197,8 +198,8 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { return false; } - KokkosKernels::sort_crs_matrix(output_mat_actual); - KokkosKernels::sort_crs_matrix(output_mat_reference); + KokkosSparse::sort_crs_matrix(output_mat_actual); + KokkosSparse::sort_crs_matrix(output_mat_reference); bool is_identical = true; is_identical = KokkosKernels::Impl::kk_is_identical_view< @@ -264,9 +265,9 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, // Generate random compressed sparse row matrix. Randomly generated (non-zero) // values are stored in a 1-D (1 rank) array. - crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix( m, k, nnz, row_size_variance, bandwidth); - crsMat_t B = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t B = KokkosSparse::Impl::kk_generate_sparse_matrix( k, n, nnz, row_size_variance, bandwidth); const bool is_empy_case = m < 1 || n < 1 || k < 1 || nnz < 1; diff --git a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp index 885b1a07fe..f9db6f4d8d 100644 --- a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp @@ -45,8 +45,8 @@ #include #include -#include "KokkosKernels_SparseUtils.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_SortCrs.hpp" #include #include #include @@ -58,6 +58,7 @@ #include #include +#include using namespace KokkosSparse; using namespace KokkosSparse::Experimental; @@ -154,7 +155,7 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2) { size_t nentries2 = output_mat2.graph.entries.extent(0); size_t nvals2 = output_mat2.values.extent(0); - KokkosKernels::sort_crs_matrix(output_mat1); + KokkosSparse::sort_crs_matrix(output_mat1); if (nrows1 != nrows2) { std::cout << "nrows1:" << nrows1 << " nrows2:" << nrows2 << std::endl; @@ -170,7 +171,7 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2) { return false; } - KokkosKernels::sort_crs_matrix(output_mat2); + KokkosSparse::sort_crs_matrix(output_mat2); bool is_identical = true; is_identical = KokkosKernels::Impl::kk_is_identical_view< @@ -225,7 +226,7 @@ void test_spgemm_jacobi(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t numCols = numRows; crsMat_t input_mat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); crsMat_t output_mat2; diff --git a/unit_test/sparse/Test_Sparse_spiluk.hpp b/unit_test/sparse/Test_Sparse_spiluk.hpp index 353543b751..8f9ef99063 100644 --- a/unit_test/sparse/Test_Sparse_spiluk.hpp +++ b/unit_test/sparse/Test_Sparse_spiluk.hpp @@ -49,7 +49,7 @@ #include #include -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include #include "KokkosBlas1_nrm2.hpp" diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index 5cb729f311..8a15153dce 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "KokkosKernels_Controls.hpp" @@ -422,7 +423,7 @@ void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t numCols = numRows; - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, row_size_variance, bandwidth); lno_t nr = input_mat.numRows(); lno_t nc = input_mat.numCols(); @@ -513,7 +514,7 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::fill_random(b_xt, rand_pool, randomUpperBound(max_x)); Kokkos::fill_random(b_yt, rand_pool, randomUpperBound(max_y)); - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, row_size_variance, bandwidth); const lno_t max_nnz_per_row = @@ -574,7 +575,7 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, constexpr mag_t max_y = static_cast(10); constexpr mag_t max_val = static_cast(10); - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numRows, nnz, row_size_variance, bandwidth); Kokkos::Random_XorShift64_Pool rand_pool( 13718); @@ -889,7 +890,7 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t numCols = numRows; - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, row_size_variance, bandwidth); lno_t nr = input_mat.numRows(); lno_t nc = input_mat.numCols(); diff --git a/unit_test/sparse/Test_Sparse_sptrsv.hpp b/unit_test/sparse/Test_Sparse_sptrsv.hpp index 0b175da13d..08c5494c88 100644 --- a/unit_test/sparse/Test_Sparse_sptrsv.hpp +++ b/unit_test/sparse/Test_Sparse_sptrsv.hpp @@ -50,7 +50,7 @@ #include #include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_CrsMatrix.hpp" diff --git a/unit_test/sparse/Test_Sparse_trsv.hpp b/unit_test/sparse/Test_Sparse_trsv.hpp index 4b1f00c98a..776674344a 100644 --- a/unit_test/sparse/Test_Sparse_trsv.hpp +++ b/unit_test/sparse/Test_Sparse_trsv.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -76,7 +77,7 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, // this function creates a dense lower and upper triangular matrix. // TODO: SHOULD CHANGE IT TO SPARSE crsMat_t lower_part = - KokkosKernels::Impl::kk_generate_triangular_sparse_matrix( + KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( 'L', numRows, numCols, nnz, row_size_variance, bandwidth); KokkosSparse::spmv("N", alpha, lower_part, b_x_copy, beta, b_y); Test::check_trsv_mv(lower_part, b_x, b_y, b_x_copy, numMV, "L", "N"); @@ -86,7 +87,7 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, // typedef typename Kokkos::View indexview; crsMat_t upper_part = - KokkosKernels::Impl::kk_generate_triangular_sparse_matrix( + KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( 'U', numRows, numCols, nnz, row_size_variance, bandwidth); KokkosSparse::spmv("N", alpha, upper_part, b_x_copy, beta, b_y); Test::check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "N"); From e2a88fccc4442a254a4c51cc782a191ca7130bfe Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 2 Jun 2022 17:44:54 -0600 Subject: [PATCH 171/261] Applying clang-format --- .../sparse/KokkosSparse_wiki_gauss_seidel.cpp | 99 ++++++++++--------- perf_test/graph/KokkosGraph_color.cpp | 6 +- .../sparse/KokkosSparse_multimem_spgemm.hpp | 12 +-- perf_test/sparse/KokkosSparse_pcg.cpp | 5 +- .../sparse/KokkosSparse_run_spgemm_jacobi.hpp | 12 +-- src/sparse/KokkosSparse_IOUtils.hpp | 22 +++-- src/sparse/KokkosSparse_SortCrs.hpp | 66 ++++++------- src/sparse/KokkosSparse_sptrsv_supernode.hpp | 2 +- ...okkosSparse_twostage_gauss_seidel_impl.hpp | 2 +- unit_test/sparse/Test_Sparse_SortCrs.hpp | 7 +- 10 files changed, 114 insertions(+), 119 deletions(-) diff --git a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp index 57b8ddd4ec..ce171c46bd 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp @@ -8,83 +8,90 @@ #include "KokkosSparse_gauss_seidel.hpp" #include "KokkosBlas1_nrm2.hpp" -//Parallel Gauss-Seidel Preconditioner/Smoother +// Parallel Gauss-Seidel Preconditioner/Smoother // -Uses graph coloring to find independent row sets, // and applies GS to each set in parallel // -Here, use to solve a diagonally dominant linear system directly. -//Helper to print out colors in the shape of the grid -int main() -{ - using Scalar = default_scalar; - using Mag = Kokkos::ArithTraits::mag_type; - using Ordinal = default_lno_t; - using Offset = default_size_type; +// Helper to print out colors in the shape of the grid +int main() { + using Scalar = default_scalar; + using Mag = Kokkos::ArithTraits::mag_type; + using Ordinal = default_lno_t; + using Offset = default_size_type; using ExecSpace = Kokkos::DefaultExecutionSpace; - using MemSpace = typename ExecSpace::memory_space; - using Device = Kokkos::Device; - using Handle = KokkosKernels::Experimental:: - KokkosKernelsHandle; - using Matrix = KokkosSparse::CrsMatrix; - using Vector = typename Matrix::values_type; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; + using Handle = KokkosKernels::Experimental::KokkosKernelsHandle< + Offset, Ordinal, default_scalar, ExecSpace, MemSpace, MemSpace>; + using Matrix = KokkosSparse::CrsMatrix; + using Vector = typename Matrix::values_type; constexpr Ordinal numRows = 10000; - const Scalar one = Kokkos::ArithTraits::one(); - const Mag magOne = Kokkos::ArithTraits::one(); - //Solve tolerance + const Scalar one = Kokkos::ArithTraits::one(); + const Mag magOne = Kokkos::ArithTraits::one(); + // Solve tolerance const Mag tolerance = 1e-6 * magOne; Kokkos::initialize(); { - //Generate a square, strictly diagonally dominant, but nonsymmetric matrix on which Gauss-Seidel should converge. - //Get approx. 20 entries per row - //Diagonals are 2x the absolute sum of all other entries. + // Generate a square, strictly diagonally dominant, but nonsymmetric matrix + // on which Gauss-Seidel should converge. Get approx. 20 entries per row + // Diagonals are 2x the absolute sum of all other entries. Offset nnz = numRows * 20; - Matrix A = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix(numRows, numRows, nnz, 2, 100, 1.05 * one); - std::cout << "Generated a matrix with " << numRows << " rows/cols, and " << nnz << " entries.\n"; - //Create a kernel handle, then a Gauss-Seidel handle with the default algorithm + Matrix A = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< + Matrix>(numRows, numRows, nnz, 2, 100, 1.05 * one); + std::cout << "Generated a matrix with " << numRows << " rows/cols, and " + << nnz << " entries.\n"; + // Create a kernel handle, then a Gauss-Seidel handle with the default + // algorithm Handle handle; handle.create_gs_handle(KokkosSparse::GS_DEFAULT); - //Set up Gauss-Seidel for the graph (matrix sparsity pattern) - KokkosSparse::Experimental::gauss_seidel_symbolic(&handle, numRows, numRows, A.graph.row_map, A.graph.entries, false); - //Set up Gauss-Seidel for the matrix values (numeric) - //Another matrix with the same sparsity pattern could re-use the handle and symbolic phase, and only call numeric. - KokkosSparse::Experimental::gauss_seidel_numeric(&handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, false); - //Now, preconditioner is ready to use. Set up an unknown vector (uninitialized) and randomized right-hand-side vector. + // Set up Gauss-Seidel for the graph (matrix sparsity pattern) + KokkosSparse::Experimental::gauss_seidel_symbolic( + &handle, numRows, numRows, A.graph.row_map, A.graph.entries, false); + // Set up Gauss-Seidel for the matrix values (numeric) + // Another matrix with the same sparsity pattern could re-use the handle and + // symbolic phase, and only call numeric. + KokkosSparse::Experimental::gauss_seidel_numeric( + &handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, + false); + // Now, preconditioner is ready to use. Set up an unknown vector + // (uninitialized) and randomized right-hand-side vector. Vector x(Kokkos::view_alloc(Kokkos::WithoutInitializing, "x"), numRows); Vector b(Kokkos::view_alloc(Kokkos::WithoutInitializing, "b"), numRows); Vector res(Kokkos::view_alloc(Kokkos::WithoutInitializing, "res"), numRows); auto bHost = Kokkos::create_mirror_view(b); - for(Ordinal i = 0; i < numRows; i++) + for (Ordinal i = 0; i < numRows; i++) bHost(i) = 3 * ((one * rand()) / RAND_MAX); Kokkos::deep_copy(b, bHost); - //Measure initial residual norm ||Ax - b||, where x is 0 - Mag initialRes = KokkosBlas::nrm2(b); + // Measure initial residual norm ||Ax - b||, where x is 0 + Mag initialRes = KokkosBlas::nrm2(b); Mag scaledResNorm = magOne; - bool firstIter = true; - //Iterate until reaching the tolerance + bool firstIter = true; + // Iterate until reaching the tolerance int numIters = 0; - while(scaledResNorm > tolerance) - { - //Run one sweep of forward Gauss-Seidel (SOR with omega = 1.0) - //If this is the first iteration, tell apply: + while (scaledResNorm > tolerance) { + // Run one sweep of forward Gauss-Seidel (SOR with omega = 1.0) + // If this is the first iteration, tell apply: // * to zero out x (it was uninitialized) - // * that b has changed since the previous apply (since there was no previous apply) + // * that b has changed since the previous apply (since there was no + // previous apply) KokkosSparse::Experimental::forward_sweep_gauss_seidel_apply( - &handle, numRows, numRows, - A.graph.row_map, A.graph.entries, A.values, + &handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, x, b, firstIter, firstIter, one, 1); firstIter = false; - //Now, compute the new residual norm using SPMV + // Now, compute the new residual norm using SPMV Kokkos::deep_copy(res, b); - //Compute res := Ax - res (since res is now equal to b, this is Ax - b) + // Compute res := Ax - res (since res is now equal to b, this is Ax - b) KokkosSparse::spmv("N", one, A, x, -one, res); - //Recompute the scaled norm + // Recompute the scaled norm scaledResNorm = KokkosBlas::nrm2(res) / initialRes; numIters++; - std::cout << "Iteration " << numIters << " scaled residual norm: " << scaledResNorm << '\n'; + std::cout << "Iteration " << numIters + << " scaled residual norm: " << scaledResNorm << '\n'; } std::cout << "SUCCESS: converged in " << numIters << " iterations.\n"; } Kokkos::finalize(); return 0; } - diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index 7c6dda889f..cc19c19675 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -377,16 +377,14 @@ void run_multi_mem_experiment(Parameters params) { if (params.a_mem_space == 1) { fast_crstmat_t a_fast_crsmat; a_fast_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix( - a_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); a_fast_crsgraph = a_fast_crsmat.graph; num_cols = a_fast_crsmat.numCols(); } else { slow_crstmat_t a_slow_crsmat; a_slow_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix( - a_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); a_slow_crsgraph = a_slow_crsmat.graph; num_cols = a_slow_crsmat.numCols(); } diff --git a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp index 78520d64eb..d7ae6da430 100644 --- a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp +++ b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp @@ -75,12 +75,10 @@ void run_multi_mem_spgemm(Parameters params) { if (params.a_mem_space == 1) { a_fast_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix( - a_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); } else { a_slow_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix( - a_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); } if ((b_mat_file == NULL || strcmp(b_mat_file, a_mat_file) == 0) && @@ -91,13 +89,11 @@ void run_multi_mem_spgemm(Parameters params) { } else if (params.b_mem_space == 1) { if (b_mat_file == NULL) b_mat_file = a_mat_file; b_fast_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix( - b_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); } else { if (b_mat_file == NULL) b_mat_file = a_mat_file; b_slow_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix( - b_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); } if (params.a_mem_space == 1) { diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index a98a8fcec8..b485158125 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -264,9 +264,8 @@ void run_pcg(int *cmdline, const char *mtx_file) { default_lno_t *xadj, *adj; default_scalar *ew; - KokkosSparse::Impl::read_matrix(&nv, &ne, &xadj, &adj, &ew, - mtx_file); + KokkosSparse::Impl::read_matrix( + &nv, &ne, &xadj, &adj, &ew, mtx_file); typedef typename KokkosSparse::CrsMatrix( - a_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); } else { a_slow_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix( - a_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); } if ((b_mat_file == NULL || strcmp(b_mat_file, a_mat_file) == 0) && @@ -354,13 +352,11 @@ void run_spgemm_jacobi(Parameters params) { } else if (params.b_mem_space == 1) { if (b_mat_file == NULL) b_mat_file = a_mat_file; b_fast_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix( - b_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); } else { if (b_mat_file == NULL) b_mat_file = a_mat_file; b_slow_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix( - b_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); } if (params.a_mem_space == 1) { diff --git a/src/sparse/KokkosSparse_IOUtils.hpp b/src/sparse/KokkosSparse_IOUtils.hpp index d847fc9d10..fa6d08f960 100644 --- a/src/sparse/KokkosSparse_IOUtils.hpp +++ b/src/sparse/KokkosSparse_IOUtils.hpp @@ -497,7 +497,8 @@ void convert_undirected_edge_list_to_csr(lno_t nv, size_type ne, in_lno_t *srcs, #include #include #include - __gnu_parallel::parallel_sort_mwms *>( + __gnu_parallel::parallel_sort_mwms< + false, true, struct KokkosKernels::Impl::Edge *>( &(edges[0]), &(edges[0]) + ne * 2, std::less>(), 64); #else @@ -805,7 +806,8 @@ void write_kokkos_crst_matrix(crs_matrix_t a_crsmat, const char *filename) { scalar_t *a_values = a_values_view.data(); std::string strfilename(filename); - if (KokkosKernels::Impl::endswith(strfilename, ".mtx") || KokkosKernels::Impl::endswith(strfilename, ".mm")) { + if (KokkosKernels::Impl::endswith(strfilename, ".mtx") || + KokkosKernels::Impl::endswith(strfilename, ".mm")) { write_matrix_mtx( a_crsmat.numRows(), a_crsmat.numCols(), a_crsmat.nnz(), a_rowmap, a_entries, a_values, filename); @@ -971,7 +973,8 @@ int read_mtx(const char *fileName, lno_t *nrows, lno_t *ncols, size_type *ne, numEdges = 2 * nnz; } // numEdges is only an upper bound (diagonal entries may be removed) - std::vector> edges(numEdges); + std::vector> edges( + numEdges); size_type nE = 0; lno_t numDiagonal = 0; for (size_type i = 0; i < nnz; ++i) { @@ -1076,7 +1079,8 @@ template void read_matrix(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, scalar_t **ew, const char *filename) { std::string strfilename(filename); - if (KokkosKernels::Impl::endswith(strfilename, ".mtx") || KokkosKernels::Impl::endswith(strfilename, ".mm")) { + if (KokkosKernels::Impl::endswith(strfilename, ".mtx") || + KokkosKernels::Impl::endswith(strfilename, ".mm")) { read_mtx(filename, nv, ne, xadj, adj, ew, false, false, false); } @@ -1096,8 +1100,8 @@ void read_matrix(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, template crsMat_t read_kokkos_crst_matrix(const char *filename_) { std::string strfilename(filename_); - bool isMatrixMarket = - KokkosKernels::Impl::endswith(strfilename, ".mtx") || KokkosKernels::Impl::endswith(strfilename, ".mm"); + bool isMatrixMarket = KokkosKernels::Impl::endswith(strfilename, ".mtx") || + KokkosKernels::Impl::endswith(strfilename, ".mm"); typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type::non_const_type row_map_view_t; @@ -1265,6 +1269,6 @@ inline void kk_sequential_create_incidence_matrix_transpose( } } -} // namespace Impl -} // namespace KokkosKernels -#endif // _KOKKOSSPARSE_IOUTILS_HPP +} // namespace Impl +} // namespace KokkosSparse +#endif // _KOKKOSSPARSE_IOUTILS_HPP diff --git a/src/sparse/KokkosSparse_SortCrs.hpp b/src/sparse/KokkosSparse_SortCrs.hpp index 03d51386e5..11e3b43acb 100644 --- a/src/sparse/KokkosSparse_SortCrs.hpp +++ b/src/sparse/KokkosSparse_SortCrs.hpp @@ -392,8 +392,8 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const values_t& values) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; - bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); - lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; if (numRows == 0) return; Impl::SortCrsMatrixFunctor funct(useRadix, rowmap, entries, values); @@ -472,8 +472,8 @@ template void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; - bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); - lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; if (numRows == 0) return; Impl::SortCrsGraphFunctor funct( useRadix, rowmap, entries); @@ -531,8 +531,8 @@ crsMat_t sort_and_merge_matrix(const crsMat_t& A) { mergedRowmap, A.graph.row_map, A.graph.entries), numCompressedEntries); // Prefix sum to get rowmap - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(A.numRows() + 1, - mergedRowmap); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + A.numRows() + 1, mergedRowmap); entries_t mergedEntries("SortedMerged entries", numCompressedEntries); values_t mergedValues("SortedMerged values", numCompressedEntries); // Compute merged entries and values @@ -576,8 +576,8 @@ void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, rowmap_out, rowmap_in, entries_in), numCompressedEntries); // Prefix sum to get rowmap - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(numRows + 1, - rowmap_out); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + numRows + 1, rowmap_out); entries_out = entries_t("SortedMerged entries", numCompressedEntries); // Compute merged entries and values Kokkos::parallel_for( @@ -601,7 +601,7 @@ crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { return crsGraph_t(mergedEntries, mergedRowmap); } -} // namespace KokkosSparse +} // namespace KokkosSparse namespace KokkosKernels { @@ -614,15 +614,15 @@ namespace KokkosKernels { template -[[deprecated]] -void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, - const entries_t& entries, const values_t& values) { +[[deprecated]] void sort_bsr_matrix(const lno_t blockdim, + const rowmap_t& rowmap, + const entries_t& entries, + const values_t& values) { KokkosSparse::sort_bsr_matrix(blockdim, rowmap, entries, values); } template -[[deprecated]] -void sort_bsr_matrix(const bsrMat_t& A) { +[[deprecated]] void sort_bsr_matrix(const bsrMat_t& A) { KokkosSparse::sort_bsr_matrix(A); } @@ -635,27 +635,25 @@ void sort_bsr_matrix(const bsrMat_t& A) { template -[[deprecated]] -void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, - const values_t& values) { +[[deprecated]] void sort_crs_matrix(const rowmap_t& rowmap, + const entries_t& entries, + const values_t& values) { KokkosSparse::sort_crs_matrix(rowmap, entries, values); } template -[[deprecated]] -void sort_crs_matrix(const crsMat_t& A) { +[[deprecated]] void sort_crs_matrix(const crsMat_t& A) { KokkosSparse::sort_crs_matrix(A); } template -[[deprecated]] -void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { +[[deprecated]] void sort_crs_graph(const rowmap_t& rowmap, + const entries_t& entries) { KokkosSparse::sort_crs_graph(rowmap, entries); } template -[[deprecated]] -void sort_crs_graph(const crsGraph_t& G) { +[[deprecated]] void sort_crs_graph(const crsGraph_t& G) { KokkosSparse::sort_crs_graph(G); } @@ -663,23 +661,21 @@ void sort_crs_graph(const crsGraph_t& G) { // sorted and has no duplicated entries: each (i, j) is unique. Values for // duplicated entries are summed. template -[[deprecated]] -crsMat_t sort_and_merge_matrix(const crsMat_t& A) { +[[deprecated]] crsMat_t sort_and_merge_matrix(const crsMat_t& A) { KokkosSparse::sort_and_merge_matrix(A); } template -[[deprecated]] -crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { +[[deprecated]] crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { KokkosSparse::sort_and_merge_graph(G); } template -[[deprecated]] -void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, - const entries_t& entries_in, rowmap_t& rowmap_out, - entries_t& entries_out) { - KokkosSparse::sort_and_merge_graph(rowmap_in, entries_in, rowmap_out, entries_out); +[[deprecated]] void sort_and_merge_graph( + const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, + rowmap_t& rowmap_out, entries_t& entries_out) { + KokkosSparse::sort_and_merge_graph(rowmap_in, entries_in, rowmap_out, + entries_out); } // For backward compatibility: keep the public interface accessible in @@ -719,7 +715,7 @@ template return KokkosKernels::sort_and_merge_matrix(A); } -} // namespace Impl -} // namespace KokkosKernels +} // namespace Impl +} // namespace KokkosKernels -#endif // _KOKKOSSPARSE_SORTCRS_HPP +#endif // _KOKKOSSPARSE_SORTCRS_HPP diff --git a/src/sparse/KokkosSparse_sptrsv_supernode.hpp b/src/sparse/KokkosSparse_sptrsv_supernode.hpp index 1c86121bde..481bd2cc0a 100644 --- a/src/sparse/KokkosSparse_sptrsv_supernode.hpp +++ b/src/sparse/KokkosSparse_sptrsv_supernode.hpp @@ -598,7 +598,7 @@ host_graph_t generate_supernodal_graph(bool col_major, graph_t &graph, // sort column ids per row KokkosSparse::sort_crs_graph(hr, hc); + row_map_view_host_t, cols_view_host_t>(hr, hc); #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE time_seconds = timer.seconds(); std::cout << " > Generate Supernodal Graph: sort graph : " diff --git a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index 6adafd6319..d779ff3e96 100644 --- a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -858,7 +858,7 @@ class TwostageGaussSeidel { entries_view_t, values_view_t>( rowmap_viewL, column_viewL, values_viewL); KokkosSparse::sort_crs_matrix( + entries_view_t, values_view_t>( rowmap_viewU, column_viewU, values_viewU); // now do symbolic diff --git a/unit_test/sparse/Test_Sparse_SortCrs.hpp b/unit_test/sparse/Test_Sparse_SortCrs.hpp index edae86304c..a4d30b40a1 100644 --- a/unit_test/sparse/Test_Sparse_SortCrs.hpp +++ b/unit_test/sparse/Test_Sparse_SortCrs.hpp @@ -43,7 +43,8 @@ */ /// \file Test_Sparse_SortCrs.hpp -/// \brief Tests for sort_crs_matrix and sort_crs_graph in KokkosSparse_SortCrs.hpp +/// \brief Tests for sort_crs_matrix and sort_crs_graph in +/// KokkosSparse_SortCrs.hpp #ifndef KOKKOSSPARSE_SORTCRSTEST_HPP #define KOKKOSSPARSE_SORTCRSTEST_HPP @@ -59,8 +60,6 @@ #include #include - - template void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type nnz, bool doValues, bool doStructInterface) { @@ -308,4 +307,4 @@ TEST_F(TestCategory, common_sort_merge_crsmatrix) { testSortAndMerge(); } -#endif // KOKKOSSPARSE_SORTCRSTEST_HPP +#endif // KOKKOSSPARSE_SORTCRSTEST_HPP From be71d80e81ab4c80213b8c535a8b34939010d30f Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 6 Jun 2022 11:02:44 -0600 Subject: [PATCH 172/261] common cleanup: fixing issue with sparse performance tests Some tests had not been compiled on my local machine due to the instantition guards in these tests. Now that the types are enabled the issue was reproduced and fixed. --- perf_test/sparse/KokkosSparse_block_pcg.cpp | 6 +++--- perf_test/sparse/KokkosSparse_spadd.cpp | 16 ++++++++-------- src/common/KokkosKernels_IOUtils.hpp | 1 + 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_block_pcg.cpp b/perf_test/sparse/KokkosSparse_block_pcg.cpp index 89ab0bfdca..25d7a65fdd 100644 --- a/perf_test/sparse/KokkosSparse_block_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_block_pcg.cpp @@ -50,7 +50,7 @@ #include "KokkosSparse_pcg.hpp" #include "KokkosKernels_Utils.hpp" -#include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -75,7 +75,7 @@ crsMat_t create_crs_matrix(char *mtx_bin_file) { if (std::string(mtx_bin_file) == "auto") { INDEX_TYPE num_rows = 11, num_cols = 11, nnz = 40; - crsmat = KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + crsmat = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(num_rows, num_cols, nnz, 3, 5); printf("generating test matrix automatically\n"); printf(" num rows: %d", num_rows); @@ -86,7 +86,7 @@ crsMat_t create_crs_matrix(char *mtx_bin_file) { INDEX_TYPE *xadj, *adj; SCALAR_TYPE *ew; - KokkosKernels::Impl::read_matrix( + KokkosSparse::Impl::read_matrix( &nv, &ne, &xadj, &adj, &ew, mtx_bin_file); row_map_view_t rowmap_view("rowmap_view", nv + 1); diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 963ada8836..877b3c5df1 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -45,7 +45,7 @@ #include #include "KokkosKernels_config.h" #include "KokkosKernels_Handle.hpp" -#include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_Utils_cusparse.hpp" #include "KokkosSparse_Utils_mkl.hpp" #include "KokkosSparse_spadd.hpp" @@ -111,19 +111,19 @@ void run_experiment(const Params& params) { lno_t n = params.n; if (params.amtx.length()) { std::cout << "Loading A from " << params.amtx << '\n'; - A = KokkosKernels::Impl::read_kokkos_crst_matrix( + A = KokkosSparse::Impl::read_kokkos_crst_matrix( params.amtx.c_str()); m = A.numRows(); n = A.numCols(); } else { std::cout << "Randomly generating A\n"; size_type nnzUnused = m * params.nnzPerRow; - A = KokkosKernels::Impl::kk_generate_sparse_matrix( + A = KokkosSparse::Impl::kk_generate_sparse_matrix( m, n, nnzUnused, 0, (n + 3) / 3); } if (params.bmtx.length()) { std::cout << "Loading B from " << params.bmtx << '\n'; - B = KokkosKernels::Impl::read_kokkos_crst_matrix( + B = KokkosSparse::Impl::read_kokkos_crst_matrix( params.bmtx.c_str()); } else if (params.bDiag) { std::cout << "Generating B as diagonal matrix.\n"; @@ -154,7 +154,7 @@ void run_experiment(const Params& params) { } else { std::cout << "Randomly generating B\n"; size_type nnzUnused = m * params.nnzPerRow; - B = KokkosKernels::Impl::kk_generate_sparse_matrix( + B = KokkosSparse::Impl::kk_generate_sparse_matrix( m, n, nnzUnused, 0, (n + 3) / 3); } // Make sure dimensions are compatible @@ -186,8 +186,8 @@ void run_experiment(const Params& params) { if (params.sorted) { std::cout << "Assuming input matrices are sorted (explicitly sorting just " "in case)\n"; - KokkosKernels::sort_crs_matrix(A); - KokkosKernels::sort_crs_matrix(B); + KokkosSparse::sort_crs_matrix(A); + KokkosSparse::sort_crs_matrix(B); } else std::cout << "Assuming input matrices are not sorted.\n"; kh.create_spadd_handle(params.sorted); @@ -363,7 +363,7 @@ void run_experiment(const Params& params) { std::cout << "Writing C (" << m << "x" << n << ") to " << params.cmtx << "\n"; crsMat_t C("C", m, n, c_nnz, valuesC, row_mapC, entriesC); - KokkosKernels::Impl::write_kokkos_crst_matrix( + KokkosSparse::Impl::write_kokkos_crst_matrix( C, params.cmtx.c_str()); } } diff --git a/src/common/KokkosKernels_IOUtils.hpp b/src/common/KokkosKernels_IOUtils.hpp index fe72d0cbf3..42f31af65a 100644 --- a/src/common/KokkosKernels_IOUtils.hpp +++ b/src/common/KokkosKernels_IOUtils.hpp @@ -269,6 +269,7 @@ inline void kk_read_3Dview_from_file(idx_array_type &view, } template +[[deprecated]] void write_edgelist_bin(size_t ne, const idx *edge_begins, const idx *edge_ends, const wt *ew, const char *filename) { std::ofstream myFile(filename, std::ios::out | std::ios::binary); From a64734939a9fe109a7bbe90dabc651159bc40429 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 6 Jun 2022 14:21:09 -0600 Subject: [PATCH 173/261] common cleanup: fixing an issue with a default template redefinition --- src/sparse/KokkosSparse_SortCrs.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/sparse/KokkosSparse_SortCrs.hpp b/src/sparse/KokkosSparse_SortCrs.hpp index 11e3b43acb..97bad80f39 100644 --- a/src/sparse/KokkosSparse_SortCrs.hpp +++ b/src/sparse/KokkosSparse_SortCrs.hpp @@ -435,8 +435,7 @@ void sort_crs_matrix(const crsMat_t& A) { // Sort a BRS matrix: within each row, sort entries ascending by column and // permute the values accordingly. template + typename values_t, typename lno_t> void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, const entries_t& entries, const values_t& values) { // TODO: this is O(N^2) mock for debugging - do regular implementation based From 873781a9ca01d84d7b5fab5e2129308b34639877 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 6 Jun 2022 17:13:14 -0600 Subject: [PATCH 174/261] ArithTraits: improving macros and generating __float128 with macro Using macro to implement __float128 after Kokkos PR #5081 merged. Also improving macros for complex and integral types, making these almost completely auto-generated by the macro with the exception of a few definitions and the name() method. --- src/common/Kokkos_ArithTraits.hpp | 399 +++++------------------------- 1 file changed, 67 insertions(+), 332 deletions(-) diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp index 7a0a9160c8..ff1e9b6aac 100644 --- a/src/common/Kokkos_ArithTraits.hpp +++ b/src/common/Kokkos_ArithTraits.hpp @@ -229,8 +229,8 @@ namespace Details { // in the ArithTraits struct for real floating point types, hopefully // this can be expanded to Kokkos::half_t and Kokkos::bhalf_t #define KOKKOSKERNELS_ARITHTRAITS_REAL_FP(FUNC_QUAL) \ - static FUNC_QUAL val_type zero() { return static_cast(0.0); } \ - static FUNC_QUAL val_type one() { return static_cast(1.0); } \ + static FUNC_QUAL val_type zero() { return static_cast(0); } \ + static FUNC_QUAL val_type one() { return static_cast(1); } \ static FUNC_QUAL val_type min() { \ return Kokkos::Experimental::finite_min::value; \ } \ @@ -275,8 +275,8 @@ namespace Details { static FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); } \ static FUNC_QUAL bool isNan(const val_type x) { return Kokkos::isnan(x); } \ static FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); } \ - static FUNC_QUAL mag_type real(const val_type x) { return x; } \ - static FUNC_QUAL mag_type imag(const val_type) { return zero(); } \ + static FUNC_QUAL mag_type real(const val_type x) { return Kokkos::real(x); } \ + static FUNC_QUAL mag_type imag(const val_type x) { return Kokkos::imag(x); } \ static FUNC_QUAL val_type conj(const val_type x) { return x; } \ static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ return Kokkos::pow(x, y); \ @@ -309,6 +309,25 @@ namespace Details { static FUNC_QUAL mag_type eps() { return epsilon(); } #define KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(FUNC_QUAL) \ + \ + static constexpr bool is_specialized = true; \ + static constexpr bool is_signed = true; \ + static constexpr bool is_integer = false; \ + static constexpr bool is_exact = false; \ + static constexpr bool is_complex = true; \ + static constexpr bool has_infinity = true; \ + \ + using magnitudeType = mag_type; \ + using halfPrecision = ::Kokkos::complex::halfPrecision>; \ + using doublePrecision = \ + ::Kokkos::complex::doublePrecision>; \ + \ + static constexpr bool isComplex = true; \ + static constexpr bool isOrdinal = false; \ + static constexpr bool isComparable = false; \ + static constexpr bool hasMachineParameters = \ + ArithTraits::hasMachineParameters; \ + \ static FUNC_QUAL val_type zero() { \ return val_type(ArithTraits::zero(), \ ArithTraits::zero()); \ @@ -402,6 +421,22 @@ namespace Details { static KOKKOS_FUNCTION mag_type abs(const val_type x) { return x; } #define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_ABS) \ + \ + static constexpr bool is_specialized = true; \ + static constexpr bool is_integer = true; \ + static constexpr bool is_exact = true; \ + static constexpr bool is_complex = false; \ + static constexpr bool has_infinity = false; \ + \ + using magnitudeType = mag_type; \ + using halfPrecision = val_type; \ + using doublePrecision = val_type; \ + \ + static constexpr bool isComplex = false; \ + static constexpr bool isOrdinal = true; \ + static constexpr bool isComparable = true; \ + static constexpr bool hasMachineParameters = false; \ + \ static KOKKOS_FUNCTION val_type zero() { return static_cast(0); } \ static KOKKOS_FUNCTION val_type one() { return static_cast(1); } \ static KOKKOS_FUNCTION val_type min() { \ @@ -416,7 +451,7 @@ namespace Details { static KOKKOS_FUNCTION bool isInf(const val_type) { return false; } \ static KOKKOS_FUNCTION bool isNan(const val_type) { return false; } \ KOKKOSKERNELS_ABS \ - static KOKKOS_FUNCTION mag_type real(const val_type x) { return x; } \ + static KOKKOS_FUNCTION mag_type real(const val_type x) { return Kokkos::real(x); } \ static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); } \ static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } \ static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { \ @@ -1303,30 +1338,45 @@ class ArithTraits { KOKKOSKERNELS_ARITHTRAITS_REAL_FP() }; // long double specialization +#if defined(KOKKOS_ENABLE_LIBQUADMATH) +// CUDA does not support __float128 in device functions, so none of +// the class methods in this specialization are marked as device +// functions. template <> -class ArithTraits< ::Kokkos::complex > { +class ArithTraits<__float128> { public: - using val_type = ::Kokkos::complex; - using mag_type = float; + using val_type = __float128; + using mag_type = val_type; static constexpr bool is_specialized = true; static constexpr bool is_signed = true; static constexpr bool is_integer = false; static constexpr bool is_exact = false; - static constexpr bool is_complex = true; + static constexpr bool is_complex = false; static constexpr bool has_infinity = true; // Backwards compatibility with Teuchos::ScalarTraits. using magnitudeType = mag_type; - using halfPrecision = ::Kokkos::complex::halfPrecision>; - using doublePrecision = - ::Kokkos::complex::doublePrecision>; + using halfPrecision = double; + // Unfortunately, we can't rely on a standard __float256 type. + using doublePrecision = __float128; - static constexpr bool isComplex = true; - static constexpr bool isOrdinal = false; - static constexpr bool isComparable = false; - static constexpr bool hasMachineParameters = - ArithTraits::hasMachineParameters; + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; + + static std::string name() { return "__float128"; } + + KOKKOSKERNELS_ARITHTRAITS_REAL_FP() +}; // __float128 specialization +#endif // KOKKOS_ENABLE_LIBQUADMATH + +template <> +class ArithTraits< ::Kokkos::complex > { + public: + using val_type = ::Kokkos::complex; + using mag_type = float; static std::string name() { return "Kokkos::complex"; } @@ -1339,26 +1389,6 @@ class ArithTraits< ::Kokkos::complex > { using val_type = ::Kokkos::complex; using mag_type = double; - static constexpr bool is_specialized = true; - static constexpr bool is_signed = true; - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr bool is_complex = true; - - static constexpr bool has_infinity = true; - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = ::Kokkos::complex::halfPrecision>; - using doublePrecision = - ::Kokkos::complex::doublePrecision>; - - static constexpr bool isComplex = true; - static constexpr bool isOrdinal = false; - static constexpr bool isComparable = false; - static constexpr bool hasMachineParameters = - ArithTraits::hasMachineParameters; - static std::string name() { return "Kokkos::complex"; } KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(KOKKOS_FUNCTION) @@ -1604,152 +1634,17 @@ class ArithTraits > { static mag_type rmax() { return ArithTraits::rmax(); } }; -#if defined(KOKKOS_ENABLE_LIBQUADMATH) -// CUDA does not support __float128 in device functions, so none of -// the class methods in this specialization are marked as device -// functions. -template <> -class ArithTraits<__float128> { - public: - using val_type = __float128; - using mag_type = val_type; - - static constexpr bool is_specialized = true; - static constexpr bool is_signed = true; - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr bool is_complex = false; - static constexpr bool has_infinity = true; - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = double; - // Unfortunately, we can't rely on a standard __float256 type. - using doublePrecision = __float128; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = false; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = true; - - static val_type zero() { return static_cast(0.0); } - static val_type one() { return static_cast(1.0); } - static val_type min() { - return Kokkos::Experimental::finite_min::value; - } - static val_type max() { - return Kokkos::Experimental::finite_max::value; - } - static val_type infinity() { - return Kokkos::Experimental::infinity::value; - } - static val_type nan() { return Kokkos::Experimental::nanq(""); } - static mag_type epsilon() { - return Kokkos::Experimental::epsilon::value; - } - static mag_type sfmin() { - return Kokkos::Experimental::norm_min::value; - } - static int base() { return Kokkos::Experimental::radix::value; } - static mag_type prec() { return epsilon() * static_cast(base()); } - static int t() { return Kokkos::Experimental::digits::value; } - static mag_type rnd() { return static_cast(1.0); } - static int emin() { - return Kokkos::Experimental::min_exponent::value; - } - static mag_type rmin() { - return Kokkos::Experimental::norm_min::value; - } - static int emax() { - return Kokkos::Experimental::max_exponent::value; - } - static mag_type rmax() { - return Kokkos::Experimental::finite_max::value; - // return Kokkos::Experimental::norm_max::value; - } - - // Math Functions - static bool isInf(const val_type x) { return Kokkos::Experimental::isinf(x); } - static bool isNan(const val_type x) { return Kokkos::Experimental::isnan(x); } - static mag_type abs(const val_type x) { - return Kokkos::Experimental::fabs(x); - } - static mag_type real(const val_type x) { return x; } - static mag_type imag(const val_type /* x */) { return zero(); } - static val_type conj(const val_type x) { return x; } - // static val_type pow(const val_type x, const val_type y) { - // return Kokkos::Experimental::pow(x, y); - // } - static val_type sqrt(const val_type x) { - return Kokkos::Experimental::sqrt(x); - } - static val_type cbrt(const val_type x) { - return Kokkos::Experimental::cbrt(x); - } - static val_type exp(const val_type x) { return Kokkos::Experimental::exp(x); } - static val_type log(const val_type x) { return Kokkos::Experimental::log(x); } - static val_type log10(const val_type x) { - return Kokkos::Experimental::log10(x); - } - static val_type sin(const val_type x) { return Kokkos::Experimental::sin(x); } - static val_type cos(const val_type x) { return Kokkos::Experimental::cos(x); } - static val_type tan(const val_type x) { return Kokkos::Experimental::tan(x); } - static val_type sinh(const val_type x) { - return Kokkos::Experimental::sinh(x); - } - static val_type cosh(const val_type x) { - return Kokkos::Experimental::cosh(x); - } - static val_type tanh(const val_type x) { - return Kokkos::Experimental::tanh(x); - } - static val_type asin(const val_type x) { - return Kokkos::Experimental::asin(x); - } - static val_type acos(const val_type x) { - return Kokkos::Experimental::acos(x); - } - static val_type atan(const val_type x) { - return Kokkos::Experimental::atan(x); - } - - // Aliases - static bool isnaninf(const val_type x) { return isNan(x) || isInf(x); } - static magnitudeType magnitude(const val_type x) { return abs(x); } - static val_type conjugate(const val_type x) { return conj(x); } - static std::string name() { return "__float128"; } - static val_type squareroot(const val_type x) { return sqrt(x); } - static mag_type eps() { return epsilon(); } -}; // __float128 specialization -#endif // KOKKOS_ENABLE_LIBQUADMATH - template <> class ArithTraits { public: using val_type = char; using mag_type = val_type; - static constexpr bool is_specialized = true; // The C(++) standard does not require that char be signed. In // fact, signed char, unsigned char, and char are distinct types. // We can use std::numeric_limits here because it's a const bool, // not a class method. static constexpr bool is_signed = std::numeric_limits::is_signed; - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr bool is_complex = false; - - static constexpr bool has_infinity = false; - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = val_type; - using doublePrecision = val_type; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = true; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = false; static std::string name() { return "char"; } @@ -1762,23 +1657,7 @@ class ArithTraits { using val_type = signed char; using mag_type = val_type; - static constexpr bool is_specialized = true; static constexpr bool is_signed = true; - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr bool is_complex = false; - - static constexpr bool has_infinity = false; - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = val_type; - using doublePrecision = val_type; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = true; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = false; static std::string name() { return "signed char"; } @@ -1791,23 +1670,7 @@ class ArithTraits { using val_type = unsigned char; using mag_type = val_type; - static constexpr bool is_specialized = true; static constexpr bool is_signed = false; - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr bool is_complex = false; - - static constexpr bool has_infinity = false; - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = val_type; - using doublePrecision = val_type; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = true; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = false; static std::string name() { return "unsigned char"; } @@ -1820,23 +1683,7 @@ class ArithTraits { using val_type = short; using mag_type = val_type; - static constexpr bool is_specialized = true; static constexpr bool is_signed = true; - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr bool is_complex = false; - - static constexpr bool has_infinity = false; - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = val_type; - using doublePrecision = val_type; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = true; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = false; static std::string name() { return "short"; } @@ -1849,23 +1696,7 @@ class ArithTraits { using val_type = unsigned short; using mag_type = val_type; - static constexpr bool is_specialized = true; static constexpr bool is_signed = false; - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr bool is_complex = false; - - static constexpr bool has_infinity = false; - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = val_type; - using doublePrecision = val_type; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = true; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = false; static std::string name() { return "unsigned short"; } @@ -1878,23 +1709,7 @@ class ArithTraits { using val_type = int; using mag_type = val_type; - static constexpr bool is_specialized = true; static constexpr bool is_signed = true; - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr bool is_complex = false; - - static constexpr bool has_infinity = false; - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = val_type; - using doublePrecision = val_type; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = true; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = false; static std::string name() { return "int"; } @@ -1907,23 +1722,7 @@ class ArithTraits { using val_type = unsigned int; using mag_type = val_type; - static constexpr bool is_specialized = true; static constexpr bool is_signed = false; - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr bool is_complex = false; - - static constexpr bool has_infinity = false; - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = val_type; - using doublePrecision = val_type; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = true; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = false; static std::string name() { return "unsigned int"; } @@ -1936,23 +1735,7 @@ class ArithTraits { using val_type = long; using mag_type = val_type; - static constexpr bool is_specialized = true; static constexpr bool is_signed = true; - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr bool is_complex = false; - - static constexpr bool has_infinity = false; - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = val_type; - using doublePrecision = val_type; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = true; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = false; static std::string name() { return "long"; } @@ -1965,23 +1748,7 @@ class ArithTraits { using val_type = unsigned long; using mag_type = val_type; - static constexpr bool is_specialized = true; static constexpr bool is_signed = false; - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr bool is_complex = false; - - static constexpr bool has_infinity = false; - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = val_type; - using doublePrecision = val_type; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = true; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = false; static std::string name() { return "unsigned long"; } @@ -1994,23 +1761,7 @@ class ArithTraits { using val_type = long long; using mag_type = val_type; - static constexpr bool is_specialized = true; static constexpr bool is_signed = true; - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr bool is_complex = false; - - static constexpr bool has_infinity = false; - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = val_type; - using doublePrecision = val_type; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = true; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = false; static std::string name() { return "long long"; } @@ -2023,23 +1774,7 @@ class ArithTraits { using val_type = unsigned long long; using mag_type = val_type; - static constexpr bool is_specialized = true; static constexpr bool is_signed = false; - static constexpr bool is_integer = true; - static constexpr bool is_exact = true; - static constexpr bool is_complex = false; - - static constexpr bool has_infinity = false; - - // Backwards compatibility with Teuchos::ScalarTraits. - using magnitudeType = mag_type; - using halfPrecision = val_type; - using doublePrecision = val_type; - - static constexpr bool isComplex = false; - static constexpr bool isOrdinal = true; - static constexpr bool isComparable = true; - static constexpr bool hasMachineParameters = false; static std::string name() { return "unsigned long long"; } From 37f68866b7dc75124bd647c2aa4980c49b253852 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 6 Jun 2022 17:02:11 -0600 Subject: [PATCH 175/261] ArithTraits: clang-format --- src/common/Kokkos_ArithTraits.hpp | 65 ++++++++++++++++--------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp index ff1e9b6aac..1246dd0ed3 100644 --- a/src/common/Kokkos_ArithTraits.hpp +++ b/src/common/Kokkos_ArithTraits.hpp @@ -309,25 +309,26 @@ namespace Details { static FUNC_QUAL mag_type eps() { return epsilon(); } #define KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(FUNC_QUAL) \ - \ - static constexpr bool is_specialized = true; \ - static constexpr bool is_signed = true; \ - static constexpr bool is_integer = false; \ - static constexpr bool is_exact = false; \ - static constexpr bool is_complex = true; \ - static constexpr bool has_infinity = true; \ - \ - using magnitudeType = mag_type; \ - using halfPrecision = ::Kokkos::complex::halfPrecision>; \ - using doublePrecision = \ - ::Kokkos::complex::doublePrecision>; \ - \ - static constexpr bool isComplex = true; \ - static constexpr bool isOrdinal = false; \ - static constexpr bool isComparable = false; \ - static constexpr bool hasMachineParameters = \ - ArithTraits::hasMachineParameters; \ - \ + \ + static constexpr bool is_specialized = true; \ + static constexpr bool is_signed = true; \ + static constexpr bool is_integer = false; \ + static constexpr bool is_exact = false; \ + static constexpr bool is_complex = true; \ + static constexpr bool has_infinity = true; \ + \ + using magnitudeType = mag_type; \ + using halfPrecision = \ + ::Kokkos::complex::halfPrecision>; \ + using doublePrecision = \ + ::Kokkos::complex::doublePrecision>; \ + \ + static constexpr bool isComplex = true; \ + static constexpr bool isOrdinal = false; \ + static constexpr bool isComparable = false; \ + static constexpr bool hasMachineParameters = \ + ArithTraits::hasMachineParameters; \ + \ static FUNC_QUAL val_type zero() { \ return val_type(ArithTraits::zero(), \ ArithTraits::zero()); \ @@ -451,7 +452,9 @@ namespace Details { static KOKKOS_FUNCTION bool isInf(const val_type) { return false; } \ static KOKKOS_FUNCTION bool isNan(const val_type) { return false; } \ KOKKOSKERNELS_ABS \ - static KOKKOS_FUNCTION mag_type real(const val_type x) { return Kokkos::real(x); } \ + static KOKKOS_FUNCTION mag_type real(const val_type x) { \ + return Kokkos::real(x); \ + } \ static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); } \ static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } \ static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { \ @@ -1644,7 +1647,7 @@ class ArithTraits { // fact, signed char, unsigned char, and char are distinct types. // We can use std::numeric_limits here because it's a const bool, // not a class method. - static constexpr bool is_signed = std::numeric_limits::is_signed; + static constexpr bool is_signed = std::numeric_limits::is_signed; static std::string name() { return "char"; } @@ -1657,7 +1660,7 @@ class ArithTraits { using val_type = signed char; using mag_type = val_type; - static constexpr bool is_signed = true; + static constexpr bool is_signed = true; static std::string name() { return "signed char"; } @@ -1670,7 +1673,7 @@ class ArithTraits { using val_type = unsigned char; using mag_type = val_type; - static constexpr bool is_signed = false; + static constexpr bool is_signed = false; static std::string name() { return "unsigned char"; } @@ -1683,7 +1686,7 @@ class ArithTraits { using val_type = short; using mag_type = val_type; - static constexpr bool is_signed = true; + static constexpr bool is_signed = true; static std::string name() { return "short"; } @@ -1696,7 +1699,7 @@ class ArithTraits { using val_type = unsigned short; using mag_type = val_type; - static constexpr bool is_signed = false; + static constexpr bool is_signed = false; static std::string name() { return "unsigned short"; } @@ -1709,7 +1712,7 @@ class ArithTraits { using val_type = int; using mag_type = val_type; - static constexpr bool is_signed = true; + static constexpr bool is_signed = true; static std::string name() { return "int"; } @@ -1722,7 +1725,7 @@ class ArithTraits { using val_type = unsigned int; using mag_type = val_type; - static constexpr bool is_signed = false; + static constexpr bool is_signed = false; static std::string name() { return "unsigned int"; } @@ -1735,7 +1738,7 @@ class ArithTraits { using val_type = long; using mag_type = val_type; - static constexpr bool is_signed = true; + static constexpr bool is_signed = true; static std::string name() { return "long"; } @@ -1748,7 +1751,7 @@ class ArithTraits { using val_type = unsigned long; using mag_type = val_type; - static constexpr bool is_signed = false; + static constexpr bool is_signed = false; static std::string name() { return "unsigned long"; } @@ -1761,7 +1764,7 @@ class ArithTraits { using val_type = long long; using mag_type = val_type; - static constexpr bool is_signed = true; + static constexpr bool is_signed = true; static std::string name() { return "long long"; } @@ -1774,7 +1777,7 @@ class ArithTraits { using val_type = unsigned long long; using mag_type = val_type; - static constexpr bool is_signed = false; + static constexpr bool is_signed = false; static std::string name() { return "unsigned long long"; } From 142577db1a748895761eb5daca4802974ae403c0 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 7 Jun 2022 10:04:33 -0600 Subject: [PATCH 176/261] common cleanup: applying clang-format --- perf_test/sparse/KokkosSparse_spadd.cpp | 12 ++++++------ src/common/KokkosKernels_IOUtils.hpp | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 877b3c5df1..5a273e6694 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -118,8 +118,8 @@ void run_experiment(const Params& params) { } else { std::cout << "Randomly generating A\n"; size_type nnzUnused = m * params.nnzPerRow; - A = KokkosSparse::Impl::kk_generate_sparse_matrix( - m, n, nnzUnused, 0, (n + 3) / 3); + A = KokkosSparse::Impl::kk_generate_sparse_matrix(m, n, nnzUnused, + 0, (n + 3) / 3); } if (params.bmtx.length()) { std::cout << "Loading B from " << params.bmtx << '\n'; @@ -154,8 +154,8 @@ void run_experiment(const Params& params) { } else { std::cout << "Randomly generating B\n"; size_type nnzUnused = m * params.nnzPerRow; - B = KokkosSparse::Impl::kk_generate_sparse_matrix( - m, n, nnzUnused, 0, (n + 3) / 3); + B = KokkosSparse::Impl::kk_generate_sparse_matrix(m, n, nnzUnused, + 0, (n + 3) / 3); } // Make sure dimensions are compatible if (A.numRows() != B.numRows() || A.numCols() != B.numCols()) { @@ -363,8 +363,8 @@ void run_experiment(const Params& params) { std::cout << "Writing C (" << m << "x" << n << ") to " << params.cmtx << "\n"; crsMat_t C("C", m, n, c_nnz, valuesC, row_mapC, entriesC); - KokkosSparse::Impl::write_kokkos_crst_matrix( - C, params.cmtx.c_str()); + KokkosSparse::Impl::write_kokkos_crst_matrix(C, + params.cmtx.c_str()); } } diff --git a/src/common/KokkosKernels_IOUtils.hpp b/src/common/KokkosKernels_IOUtils.hpp index 42f31af65a..08e6f3cdc7 100644 --- a/src/common/KokkosKernels_IOUtils.hpp +++ b/src/common/KokkosKernels_IOUtils.hpp @@ -269,9 +269,9 @@ inline void kk_read_3Dview_from_file(idx_array_type &view, } template -[[deprecated]] -void write_edgelist_bin(size_t ne, const idx *edge_begins, const idx *edge_ends, - const wt *ew, const char *filename) { +[[deprecated]] void write_edgelist_bin(size_t ne, const idx *edge_begins, + const idx *edge_ends, const wt *ew, + const char *filename) { std::ofstream myFile(filename, std::ios::out | std::ios::binary); myFile.write((char *)&ne, sizeof(idx)); myFile.write((char *)edge_begins, sizeof(idx) * (ne)); From c56e4ab7a15da5294da0f0674b509d251c7db1b1 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 7 Jun 2022 17:39:52 -0600 Subject: [PATCH 177/261] Common Utils: removing dependency on Sparse Utils in Common Utils Fixing some headers dependency to remove unnecessary dependency between Common and Sparse Utils. --- perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp | 1 + src/common/KokkosKernels_Utils.hpp | 1 - src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp | 1 + src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp | 1 + test_common/KokkosKernels_TestUtils.hpp | 1 + 5 files changed, 4 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp index a82ece030b..a0d127595c 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp @@ -45,6 +45,7 @@ #include "KokkosBlas2_gemv.hpp" #include #include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_IOUtils.hpp" struct Params { int use_cuda = 0; diff --git a/src/common/KokkosKernels_Utils.hpp b/src/common/KokkosKernels_Utils.hpp index a6649f102b..bf881edc6f 100644 --- a/src/common/KokkosKernels_Utils.hpp +++ b/src/common/KokkosKernels_Utils.hpp @@ -49,7 +49,6 @@ #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosKernels_SimpleUtils.hpp" -#include "KokkosSparse_Utils.hpp" #include "KokkosKernels_PrintUtils.hpp" #include "KokkosKernels_VectorUtils.hpp" diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index 1628b715a8..041a2f861b 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -48,6 +48,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Bitset.hpp" #include "KokkosKernels_Utils.hpp" +#include "KokkosSparse_Utils.hpp" #include namespace KokkosGraph { diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 62b86ca72e..abedbe80ed 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -46,6 +46,7 @@ #define _KOKKOSGSIMP_HPP #include "KokkosKernels_Utils.hpp" +#include "KokkosSparse_Utils.hpp" #include #include #include "KokkosGraph_Distance1Color.hpp" diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index a3a1ebf964..e7296b45a7 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -48,6 +48,7 @@ #include #include "KokkosKernels_Utils.hpp" +#include "KokkosKernels_IOUtils.hpp" #include "Kokkos_ArithTraits.hpp" #include "KokkosSparse_spmv.hpp" // Make this include-able from all subdirectories From 0c9c8a3fc4004c35413fb86db0af4e439d4e2a11 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 8 Jun 2022 09:25:31 -0600 Subject: [PATCH 178/261] ArithTraits: adding back nan() for integral types, see issue #1437 This implementation is honestly very debatable, using -1 for signed integeral types may lead to very surprising results... --- src/common/Kokkos_ArithTraits.hpp | 44 ++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 13 deletions(-) diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp index 1246dd0ed3..46528e8a89 100644 --- a/src/common/Kokkos_ArithTraits.hpp +++ b/src/common/Kokkos_ArithTraits.hpp @@ -418,10 +418,16 @@ namespace Details { return Kokkos::abs(x); \ } -#define KOKKOSKERNELS_UNSIGNED_ABS \ +#define KOKKOSKERNELS_UNSIGNED_ABS \ static KOKKOS_FUNCTION mag_type abs(const val_type x) { return x; } -#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_ABS) \ +#define KOKKOSKERNELS_SIGNED_NAN \ + static KOKKOS_FUNCTION val_type nan() { return -1; } + +#define KOKKOSKERNELS_UNSIGNED_NAN \ + static KOKKOS_FUNCTION val_type nan() { return max(); } + +#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_ABS, KOKKOSKERNELS_NAN) \ \ static constexpr bool is_specialized = true; \ static constexpr bool is_integer = true; \ @@ -449,6 +455,7 @@ namespace Details { static KOKKOS_FUNCTION val_type infinity() { \ return static_cast(0); \ } \ + KOKKOSKERNELS_NAN \ static KOKKOS_FUNCTION bool isInf(const val_type) { return false; } \ static KOKKOS_FUNCTION bool isNan(const val_type) { return false; } \ KOKKOSKERNELS_ABS \ @@ -1651,7 +1658,8 @@ class ArithTraits { static std::string name() { return "char"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS, + KOKKOSKERNELS_SIGNED_NAN) }; template <> @@ -1664,7 +1672,8 @@ class ArithTraits { static std::string name() { return "signed char"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS, + KOKKOSKERNELS_SIGNED_NAN) }; template <> @@ -1677,7 +1686,8 @@ class ArithTraits { static std::string name() { return "unsigned char"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS, + KOKKOSKERNELS_UNSIGNED_NAN) }; template <> @@ -1690,7 +1700,8 @@ class ArithTraits { static std::string name() { return "short"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS, + KOKKOSKERNELS_SIGNED_NAN) }; template <> @@ -1703,7 +1714,8 @@ class ArithTraits { static std::string name() { return "unsigned short"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS, + KOKKOSKERNELS_UNSIGNED_NAN) }; template <> @@ -1716,7 +1728,8 @@ class ArithTraits { static std::string name() { return "int"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS, + KOKKOSKERNELS_SIGNED_NAN) }; template <> @@ -1729,7 +1742,8 @@ class ArithTraits { static std::string name() { return "unsigned int"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS, + KOKKOSKERNELS_UNSIGNED_NAN) }; template <> @@ -1742,7 +1756,8 @@ class ArithTraits { static std::string name() { return "long"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS, + KOKKOSKERNELS_SIGNED_NAN) }; template <> @@ -1755,7 +1770,8 @@ class ArithTraits { static std::string name() { return "unsigned long"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS, + KOKKOSKERNELS_UNSIGNED_NAN) }; template <> @@ -1768,7 +1784,8 @@ class ArithTraits { static std::string name() { return "long long"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS, + KOKKOSKERNELS_SIGNED_NAN) }; template <> @@ -1781,7 +1798,8 @@ class ArithTraits { static std::string name() { return "unsigned long long"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS, + KOKKOSKERNELS_UNSIGNED_NAN) }; // dd_real and qd_real are floating-point types provided by the QD From 58f18ca5a6a20792f40b453eca7aa306ed54207e Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 8 Jun 2022 09:20:50 -0600 Subject: [PATCH 179/261] ArithTraits: applying clang-format --- src/common/Kokkos_ArithTraits.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp index 46528e8a89..d6271f9b4e 100644 --- a/src/common/Kokkos_ArithTraits.hpp +++ b/src/common/Kokkos_ArithTraits.hpp @@ -418,16 +418,17 @@ namespace Details { return Kokkos::abs(x); \ } -#define KOKKOSKERNELS_UNSIGNED_ABS \ +#define KOKKOSKERNELS_UNSIGNED_ABS \ static KOKKOS_FUNCTION mag_type abs(const val_type x) { return x; } -#define KOKKOSKERNELS_SIGNED_NAN \ +#define KOKKOSKERNELS_SIGNED_NAN \ static KOKKOS_FUNCTION val_type nan() { return -1; } -#define KOKKOSKERNELS_UNSIGNED_NAN \ +#define KOKKOSKERNELS_UNSIGNED_NAN \ static KOKKOS_FUNCTION val_type nan() { return max(); } -#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_ABS, KOKKOSKERNELS_NAN) \ +#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_ABS, \ + KOKKOSKERNELS_NAN) \ \ static constexpr bool is_specialized = true; \ static constexpr bool is_integer = true; \ From a70474ce0030a079edefe17da9155625b869fb87 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 8 Jun 2022 11:39:49 -0600 Subject: [PATCH 180/261] Test clean-up: removing unnecessary include from KokkosBlas2_gemv_perf_test.cpp --- perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp index a0d127595c..a82ece030b 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp @@ -45,7 +45,6 @@ #include "KokkosBlas2_gemv.hpp" #include #include "KokkosKernels_TestUtils.hpp" -#include "KokkosKernels_IOUtils.hpp" struct Params { int use_cuda = 0; From 9ee8783906576ea643f6aa1685e145fb781f32e3 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 10 Jun 2022 14:29:19 -0600 Subject: [PATCH 181/261] Add template params to forwarding calls in deprecated KokkosKernels::sort_crs* Address #1440 --- src/sparse/KokkosSparse_SortCrs.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sparse/KokkosSparse_SortCrs.hpp b/src/sparse/KokkosSparse_SortCrs.hpp index 97bad80f39..c1b28097f1 100644 --- a/src/sparse/KokkosSparse_SortCrs.hpp +++ b/src/sparse/KokkosSparse_SortCrs.hpp @@ -637,7 +637,7 @@ template (rowmap, entries, values); } template @@ -648,7 +648,7 @@ template template [[deprecated]] void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { - KokkosSparse::sort_crs_graph(rowmap, entries); + KokkosSparse::sort_crs_graph(rowmap, entries); } template From 77bb9c3fc9b60abf6396e504a2c81786e69b03a1 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 10 Jun 2022 14:46:56 -0600 Subject: [PATCH 182/261] apply clang-format --- src/sparse/KokkosSparse_SortCrs.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sparse/KokkosSparse_SortCrs.hpp b/src/sparse/KokkosSparse_SortCrs.hpp index c1b28097f1..68de6b5f7c 100644 --- a/src/sparse/KokkosSparse_SortCrs.hpp +++ b/src/sparse/KokkosSparse_SortCrs.hpp @@ -637,7 +637,8 @@ template (rowmap, entries, values); + KokkosSparse::sort_crs_matrix( + rowmap, entries, values); } template @@ -648,7 +649,8 @@ template template [[deprecated]] void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { - KokkosSparse::sort_crs_graph(rowmap, entries); + KokkosSparse::sort_crs_graph(rowmap, + entries); } template From 84cdaaef953ec18dc3e4569fd1576c7010c2679c Mon Sep 17 00:00:00 2001 From: Carl William Pearson Date: Wed, 25 May 2022 16:52:22 -0600 Subject: [PATCH 183/261] cusparseSpMM for CrsMatrix multivector product * Y is LayoutLeft * X is LayoutLeft or LayoutRight * Scalars are fp64, fp32, or fp16 * Index/Offset types are int only This TPL will be used if available, set algorithm=native in controls to disable. Require CUSPARSE_VERSION >= 10301 because cusparseSpMM produces incorrect results for non-transpose operations before that. The required cuSparse ships with CUDA 10.2.89. --- src/batched/KokkosBatched_Util.hpp | 3 +- .../KokkosSparse_spmv_mv_tpl_spec_avail.hpp | 175 +++++++++ .../KokkosSparse_spmv_mv_tpl_spec_decl.hpp | 336 ++++++++++++++++++ .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 11 +- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 28 +- src/sparse/KokkosSparse_Utils_cusparse.hpp | 77 ++++ src/sparse/KokkosSparse_spmv.hpp | 67 +++- src/sparse/impl/KokkosSparse_spmv_spec.hpp | 13 +- 8 files changed, 657 insertions(+), 53 deletions(-) create mode 100644 src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp create mode 100644 src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index cdb3c55d3c..338c3fe8f8 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -854,10 +854,9 @@ KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c, template KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c, - ScalarType alpha, + ScalarType /*alpha*/, const AlphaTag::No &) { return reg_c; - (void)alpha; } template ::type>::value> +struct spmv_mv_tpl_spec_avail { + enum : bool { value = false }; +}; + +#define KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(SCALAR, ORDINAL, OFFSET, \ + XL, YL, MEMSPACE) \ + template <> \ + struct spmv_mv_tpl_spec_avail< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET, const SCALAR**, \ + XL, Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR**, YL, Kokkos::Device, \ + Kokkos::MemoryTraits > { \ + enum : bool { value = true }; \ + }; + +/* CUSPARSE_VERSION 10300 and lower seem to have a bug in cusparseSpMM +non-transpose that produces incorrect result. This is cusparse distributed with +CUDA 10.1.243. The bug seems to be resolved by CUSPARSE 10301 (present by +CUDA 10.2.89) */ +#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int, + int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int, + int, Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int, + int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int, + int, Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) + +#endif +#endif // defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) + +} // namespace Impl +} // namespace KokkosSparse + +#endif // KOKKOSPARSE_SPMV_MV_TPL_SPEC_AVAIL_HPP_ diff --git a/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp new file mode 100644 index 0000000000..0bfeec3288 --- /dev/null +++ b/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -0,0 +1,336 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_ +#define KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_ + +#include "KokkosKernels_Controls.hpp" + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +/* CUSPARSE_VERSION < 10301 either doesn't have cusparseSpMM + or the non-tranpose version produces incorrect results. +*/ +#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) +#include "cusparse.h" +#include "KokkosSparse_Utils_cusparse.hpp" + +namespace KokkosSparse { +namespace Impl { + +/* Derive a compute type for various operand types. + cusparseSpMM does not always allow the same compute type as operand types + This should be consistent with the allowed operand types for cusparseSpMM, + as needed for TPL availability. Current definition does not comprehensively + cover all cusparseSpMM options. + + cuSparse 11.5.1+ does not support uniform precision for FP16 + Otherwise, uniform precision is supported +*/ +template +cudaDataType compute_type() { + return cuda_data_type_from(); +} +#if CUSPARSE_VERSION >= 11501 +template <> +inline cudaDataType compute_type() { + return CUDA_R_32F; +} +#else +template <> +inline cudaDataType compute_type() { + return cuda_data_type_from(); +} +#endif + +/*! \brief convert a 2D view to a cusparseDnMatDescr_t + +*/ +template = true> +cusparseDnMatDescr_t make_cusparse_dn_mat_descr_t(ViewType &view) { + const int64_t rows = view.extent(0); + const int64_t cols = view.extent(1); + const int64_t ld = view.extent(0); + + // cusparseCreateCsr notes it is safe to const_cast this away for input + // pointers to a descriptor as long as that descriptor is not an output + // parameter + void *values = + const_cast(view.data()); + + cudaDataType valueType = + cuda_data_type_from(); + + // col-major is the only supported order in 10301 + // ignore the layout of the provided view, and expect the caller to + // fix with a transpose operation, if possible. + // This should be revisited once cusparse supports row-major dense matrices + const cusparseOrder_t order = CUSPARSE_ORDER_COL; + + cusparseDnMatDescr_t descr; + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseCreateDnMat(&descr, rows, cols, ld, values, valueType, order)); + + return descr; +} + +template +void spmv_mv_cusparse(const KokkosKernels::Experimental::Controls &controls, + const char mode[], + typename YVector::non_const_value_type const &alpha, + const AMatrix &A, const XVector &x, + typename YVector::non_const_value_type const &beta, + const YVector &y) { + static_assert(XVector::rank == 2, + "should only be instantiated for multivector"); + static_assert(YVector::rank == 2, + "should only be instantiated for multivector"); + + using offset_type = typename AMatrix::non_const_size_type; + using entry_type = typename AMatrix::non_const_ordinal_type; + using value_type = typename AMatrix::non_const_value_type; + using x_value_type = typename XVector::non_const_value_type; + using y_value_type = typename YVector::non_const_value_type; + + /* initialize cusparse library */ + cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + + /* Set the operation mode */ + cusparseOperation_t opA; + switch (toupper(mode[0])) { + case 'N': opA = CUSPARSE_OPERATION_NON_TRANSPOSE; break; + case 'T': opA = CUSPARSE_OPERATION_TRANSPOSE; break; + case 'H': opA = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE; break; + default: { + std::cerr << "Mode " << mode << " invalid for cuSPARSE SpMV MV.\n"; + throw std::invalid_argument("Invalid mode"); + } + } + + /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ + const cusparseIndexType_t myCusparseOffsetType = + cusparse_index_type_t_from(); + const cusparseIndexType_t myCusparseEntryType = + cusparse_index_type_t_from(); + const cudaDataType aCusparseType = cuda_data_type_from(); + + /* create matrix */ + cusparseSpMatDescr_t A_cusparse; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( + &A_cusparse, A.numRows(), A.numCols(), A.nnz(), + (void *)A.graph.row_map.data(), (void *)A.graph.entries.data(), + (void *)A.values.data(), myCusparseOffsetType, myCusparseEntryType, + CUSPARSE_INDEX_BASE_ZERO, aCusparseType)); + + /* create lhs and rhs + NOTE: The descriptions always say vecX and vecY are column-major cusparse + order. For CUSPARSE_VERSION 10301 this is the only supported ordering. if X + is not LayoutLeft, we can fix with a transpose. If cusparseSpMM ever + supports row-major dense matrices, this logic will have to be reworked */ + constexpr bool xIsLL = + std::is_same::value; + constexpr bool xIsLR = + std::is_same::value; + static_assert(xIsLL || xIsLR, "X multivector was not LL or LR (TPL error)"); + cusparseDnMatDescr_t vecX = make_cusparse_dn_mat_descr_t(x); + cusparseDnMatDescr_t vecY = make_cusparse_dn_mat_descr_t(y); + cusparseOperation_t opB = + xIsLL ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; + + const cusparseSpMMAlg_t alg = CUSPARSE_MM_ALG_DEFAULT; + + // the precision of the SpMV + const cudaDataType computeType = + compute_type(); + + size_t bufferSize = 0; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM_bufferSize( + cusparseHandle, opA, opB, &alpha, A_cusparse, vecX, &beta, vecY, + computeType, alg, &bufferSize)); + + void *dBuffer = nullptr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&dBuffer, bufferSize)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM(cusparseHandle, opA, opB, &alpha, + A_cusparse, vecX, &beta, vecY, + computeType, alg, dBuffer)); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(dBuffer)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnMat(vecX)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnMat(vecY)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(A_cusparse)); +} + +#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, XL, YL, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV_MV< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const, SCALAR const **, \ + XL, Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR **, YL, Kokkos::Device, \ + Kokkos::MemoryTraits, false, true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using AMatrix = CrsMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const **, XL, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + using Controls = KokkosKernels::Experimental::Controls; \ + static void spmv_mv(const Controls &controls, const char mode[], \ + const coefficient_type &alpha, const AMatrix &A, \ + const XVector &x, const coefficient_type &beta, \ + const YVector &y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_mv_cusparse(controls, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +/* cusparseSpMM with following restrictions + column-major ordering for Y + col-major or row-major for X (see note below) + 32-bit indices for matrix A */ +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, + Kokkos::LayoutLeft, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, + Kokkos::LayoutLeft, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int, + Kokkos::LayoutRight, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int, + Kokkos::LayoutRight, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +#endif + +#undef KOKKOSSPARSE_SPMV_MV_CUSPARSE + +} // namespace Impl +} // namespace KokkosSparse +#endif // defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +#endif // KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_ \ No newline at end of file diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index fd42797d71..a91996361b 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -201,6 +201,8 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, #endif // CUDA/CUSPARSE >= 9.0? #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#undef KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE + #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) #define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, LAYOUT) \ @@ -265,15 +267,6 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) #endif // KOKKOSKERNELS_ENABLE_TPL_MKL -// Specialization struct which defines whether a specialization exists -template ::type>::value> -struct spmv_mv_tpl_spec_avail { - enum : bool { value = false }; -}; - } // namespace Impl } // namespace KokkosSparse diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 0a92b91eb2..868d8ec047 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -86,25 +86,11 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ - cusparseIndexType_t myCusparseOffsetType; - if (std::is_same::value) - myCusparseOffsetType = CUSPARSE_INDEX_32I; - else if (std::is_same::value || - std::is_same::value) - myCusparseOffsetType = CUSPARSE_INDEX_64I; - else - throw std::logic_error( - "Offset type of CrsMatrix isn't supported by cuSPARSE, yet TPL layer " - "says it is"); - cusparseIndexType_t myCusparseEntryType; - if (std::is_same::value) - myCusparseEntryType = CUSPARSE_INDEX_32I; - else if (std::is_same::value) - myCusparseEntryType = CUSPARSE_INDEX_64I; - else - throw std::logic_error( - "Ordinal (entry) type of CrsMatrix isn't supported by cuSPARSE, yet " - "TPL layer says it is"); + const cusparseIndexType_t myCusparseOffsetType = + cusparse_index_type_t_from(); + const cusparseIndexType_t myCusparseEntryType = + cusparse_index_type_t_from(); + cudaDataType myCudaDataType; if (std::is_same::value) myCudaDataType = CUDA_R_32F; @@ -373,8 +359,8 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -#endif -#endif +#endif // defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) +#endif // 9000 <= CUDA_VERSION #undef KOKKOSSPARSE_SPMV_CUSPARSE diff --git a/src/sparse/KokkosSparse_Utils_cusparse.hpp b/src/sparse/KokkosSparse_Utils_cusparse.hpp index ea9bfd37dd..4c3ec96555 100644 --- a/src/sparse/KokkosSparse_Utils_cusparse.hpp +++ b/src/sparse/KokkosSparse_Utils_cusparse.hpp @@ -114,6 +114,83 @@ inline void cusparse_internal_safe_call(cusparseStatus_t cusparseStatus, KokkosSparse::Impl::cusparse_internal_safe_call(call, #call, __FILE__, \ __LINE__) +template +cudaDataType cuda_data_type_from() { + // compile-time failure with a nice message if called on an unsupported type + static_assert(!std::is_same::value, + "cuSparse TPL does not support scalar type"); + // static_assert(false, ...) is allowed to error even if the code is not + // instantiated. obfuscate the predicate Despite this function being + // uncompilable, the compiler may decide that a return statement is missing, + // so throw to silence that + throw std::logic_error("unreachable throw after static_assert"); +} + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +template <> +inline cudaDataType cuda_data_type_from() { + return CUDA_R_16F; // Kokkos half_t is a half +} +#else +template <> +inline cudaDataType cuda_data_type_from() { + return CUDA_R_32F; // Kokkos half_t is a float +} +#endif +template <> +inline cudaDataType cuda_data_type_from() { + return CUDA_R_32F; +} +template <> +inline cudaDataType cuda_data_type_from() { + return CUDA_R_64F; +} +template <> +inline cudaDataType cuda_data_type_from>() { + return CUDA_C_32F; +} +template <> +inline cudaDataType cuda_data_type_from>() { + return CUDA_C_32F; +} + +#if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) + +template +cusparseIndexType_t cusparse_index_type_t_from() { +#define AS_STR_LITERAL_IMPL_(x) #x +#define AS_STR_LITERAL(x) AS_STR_LITERAL_IMPL_(x) + static_assert(!std::is_same::value, + "cuSparse " AS_STR_LITERAL( + CUSPARSE_VERSION) " TPL does not support index type"); + // static_assert(false, ...) is allowed to error even if the code is not + // instantiated. obfuscate the predicate Despite this function being + // uncompilable, the compiler may decide that a return statement is missing, + // so throw to silence that + throw std::logic_error("unreachable throw after static_assert"); +#undef AS_STR_LITERAL_IMPL_ +#undef AS_STR_LITERAL +} + +template <> +inline cusparseIndexType_t cusparse_index_type_t_from() { + return CUSPARSE_INDEX_32I; +} +template <> +inline cusparseIndexType_t cusparse_index_type_t_from() { + return CUSPARSE_INDEX_64I; +} +// Currently no CUSPARSE_INDEX_64U but this will work most of the time +template <> +inline cusparseIndexType_t cusparse_index_type_t_from() { + return CUSPARSE_INDEX_64I; +} +template <> +inline cusparseIndexType_t cusparse_index_type_t_from() { + return CUSPARSE_INDEX_16U; +} +#endif + } // namespace Impl } // namespace KokkosSparse diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index 972bbc74ad..95860029f1 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -662,9 +662,10 @@ template ::value>::type* = nullptr> #endif -void spmv(KokkosKernels::Experimental::Controls /*controls*/, const char mode[], +void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y, const RANK_TWO) { + // Make sure that x and y have the same rank. static_assert( static_cast(XVector::rank) == static_cast(YVector::rank), @@ -752,21 +753,50 @@ void spmv(KokkosKernels::Experimental::Controls /*controls*/, const char mode[], XVector_Internal x_i = x; YVector_Internal y_i = y; - return Impl::SPMV_MV< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type**, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type**, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>::spmv_mv(mode, alpha, A_i, - x_i, beta, y_i); + bool useNative = false; + +// cusparseSpMM does not support conjugate mode +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + useNative = useNative || (Conjugate[0] == mode[0]); +#endif + useNative = useNative || (controls.isParameter("algorithm") && + (controls.getParameter("algorithm") == "native")); + + if (useNative) { + return Impl::SPMV_MV< + typename AMatrix_Internal::value_type, + typename AMatrix_Internal::ordinal_type, + typename AMatrix_Internal::device_type, + typename AMatrix_Internal::memory_traits, + typename AMatrix_Internal::size_type, + typename XVector_Internal::value_type**, + typename XVector_Internal::array_layout, + typename XVector_Internal::device_type, + typename XVector_Internal::memory_traits, + typename YVector_Internal::value_type**, + typename YVector_Internal::array_layout, + typename YVector_Internal::device_type, + typename YVector_Internal::memory_traits, + std::is_integral::value, + false>::spmv_mv(controls, mode, alpha, A_i, x_i, beta, y_i); + } else { + return Impl::SPMV_MV< + typename AMatrix_Internal::value_type, + typename AMatrix_Internal::ordinal_type, + typename AMatrix_Internal::device_type, + typename AMatrix_Internal::memory_traits, + typename AMatrix_Internal::size_type, + typename XVector_Internal::value_type**, + typename XVector_Internal::array_layout, + typename XVector_Internal::device_type, + typename XVector_Internal::memory_traits, + typename YVector_Internal::value_type**, + typename YVector_Internal::array_layout, + typename YVector_Internal::device_type, + typename YVector_Internal::memory_traits>::spmv_mv(controls, mode, + alpha, A_i, x_i, + beta, y_i); + } } } @@ -1531,8 +1561,9 @@ void spmv_struct(const char mode[], const int stencil_type, typename YVector_Internal::value_type**, typename YVector_Internal::array_layout, typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>::spmv_mv(mode, alpha, A_i, - x_i, beta, y_i); + typename YVector_Internal::memory_traits>:: + spmv_mv(KokkosKernels::Experimental::Controls(), mode, alpha, A_i, x_i, + beta, y_i); } } diff --git a/src/sparse/impl/KokkosSparse_spmv_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_spec.hpp index e0fdb2b6cd..cc29d72b77 100644 --- a/src/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -111,6 +111,8 @@ struct spmv_mv_eti_spec_avail { // Include the actual specialization declarations #include #include + +#include #include namespace KokkosSparse { @@ -204,7 +206,8 @@ struct SPMV_MV { typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const char mode[], const coefficient_type& alpha, + static void spmv_mv(const KokkosKernels::Experimental::Controls& controls, + const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y); }; @@ -261,7 +264,8 @@ struct SPMV_MV YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const char mode[], const coefficient_type& alpha, + static void spmv_mv(const KokkosKernels::Experimental::Controls& /*controls*/, + const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { typedef Kokkos::Details::ArithTraits KAT; @@ -287,7 +291,8 @@ struct SPMV_MV YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const char mode[], const coefficient_type& alpha, + static void spmv_mv(const KokkosKernels::Experimental::Controls& /*controls*/, + const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { static_assert(std::is_integral::value, @@ -377,6 +382,8 @@ struct SPMV_MV #include + +#include #include #endif // KOKKOSSPARSE_IMPL_SPMV_SPEC_HPP_ From 630fb844e2b8f312298bd8ed67e6d31c95024b46 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 23 Jun 2022 18:51:03 -0600 Subject: [PATCH 184/261] csc2csr: update Kokkos_Numeric.hpp header inclusion Update std_algorithms header include to match renaming/reorg of numeric headers in kokkos/kokkos#5113 --- src/sparse/KokkosSparse_csc2csr.hpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/sparse/KokkosSparse_csc2csr.hpp b/src/sparse/KokkosSparse_csc2csr.hpp index 49f84f15da..83a96c3c02 100644 --- a/src/sparse/KokkosSparse_csc2csr.hpp +++ b/src/sparse/KokkosSparse_csc2csr.hpp @@ -44,7 +44,13 @@ #include "KokkosKernels_Utils.hpp" #include -#include +#include +#include +#include +#include +#include +#include +#include #ifndef _KOKKOSSPARSE_CSC2CSR_HPP #define _KOKKOSSPARSE_CSC2CSR_HPP @@ -248,4 +254,4 @@ auto csc2csr(OrdinalType nrows, OrdinalType ncols, SizeType nnz, return csc2Csr.get_csrMat(); } } // namespace KokkosSparse -#endif // _KOKKOSSPARSE_CSC2CSR_HPP \ No newline at end of file +#endif // _KOKKOSSPARSE_CSC2CSR_HPP From caa6a3c2754e13ad574bc7a320993d9fcd936426 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 24 Jun 2022 11:44:49 -0600 Subject: [PATCH 185/261] docs: Added requirements.txt and promotion.txt --- {doc => docs}/kokkos-promotion.txt | 0 docs/requirements.txt | 1 + 2 files changed, 1 insertion(+) rename {doc => docs}/kokkos-promotion.txt (100%) create mode 100644 docs/requirements.txt diff --git a/doc/kokkos-promotion.txt b/docs/kokkos-promotion.txt similarity index 100% rename from doc/kokkos-promotion.txt rename to docs/kokkos-promotion.txt diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000000..188f51e62d --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1 @@ +breathe \ No newline at end of file From b7a5bf96d253d372d66837acca4d441871922d96 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 24 Jun 2022 12:03:27 -0600 Subject: [PATCH 186/261] README.md: Add documentation badge --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 08f80c19d6..58127b912e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![Generic badge](https://readthedocs.org/projects/pip/badge/?version=latest&style=flat)](https://kokkos-kernels.readthedocs.io/en/latest/) + ![KokkosKernels](https://avatars2.githubusercontent.com/u/10199860?s=200&v=4) # Kokkos Kernels From d7aa31070270d8fe784ceeda91244f2deaae401f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 24 Jun 2022 12:11:34 -0600 Subject: [PATCH 187/261] docs/index.rst: Under Construction --- docs/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.rst b/docs/index.rst index e0c5ea9a98..db873e9a3b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,4 +1,4 @@ -Kokkos Kernels documentation +Kokkos Kernels documentation: Under Construction ========================================== .. toctree:: :maxdepth: 2 From 7b606264e1e768c414a0ac0838bff210c1bef646 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 28 Jun 2022 13:03:13 -0600 Subject: [PATCH 188/261] dot perf test: adding support for HIP and SYCL backend Not much needed to change, the device argument is currently ignored. --- .../blas/blas1/KokkosBlas_dot_perf_test.cpp | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp index 9219d34810..9b36afca8f 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp @@ -54,6 +54,8 @@ struct Params { int use_cuda = 0; int use_openmp = 0; int use_threads = 0; + int use_hip = 0; + int use_sycl = 0; // m is vector length int m = 100000; int repeat = 1; @@ -63,7 +65,8 @@ void print_options() { std::cerr << "Options:\n" << std::endl; std::cerr << "\tBACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | " - "'--cuda [cudaDeviceIndex]'" + "'--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]' | " + "'--sycl [syclDeviceIndex]'" << std::endl; std::cerr << "\tIf no BACKEND selected, serial is the default." << std::endl; std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " @@ -86,6 +89,10 @@ int parse_inputs(Params& params, int argc, char** argv) { params.use_openmp = atoi(argv[++i]); } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { params.use_cuda = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { + params.use_hip = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) { + params.use_sycl = atoi(argv[++i]) + 1; } else if (0 == Test::string_compare_no_case(argv[i], "--m")) { params.m = atoi(argv[++i]); } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { @@ -193,6 +200,8 @@ int main(int argc, char** argv) { bool useThreads = params.use_threads != 0; bool useOMP = params.use_openmp != 0; bool useCUDA = params.use_cuda != 0; + bool useHIP = params.use_hip != 0; + bool useSYCL = params.use_sycl != 0; bool useSerial = !useThreads && !useOMP && !useCUDA; if (useThreads) { @@ -221,6 +230,25 @@ int main(int argc, char** argv) { return 1; #endif } + + if (useHIP) { +#if defined(KOKKOS_ENABLE_HIP) + run(params.m, params.repeat); +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + return 1; +#endif + } + + if (useSYCL) { +#if defined(KOKKOS_ENABLE_SYCL) + run(params.m, params.repeat); +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + return 1; +#endif + } + if (useSerial) { #if defined(KOKKOS_ENABLE_SERIAL) run(params.m, params.repeat); From 70f6a4a5ec6dc1d42c85c3fa5032215d17f04412 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 28 Jun 2022 13:14:26 -0600 Subject: [PATCH 189/261] dot perf test: adding sycl logic for multivector case Adding a bit of logic to dot_mv_perf_test so we can test it with the sycl backend. Also fixing a couple issues in the dot_perf_test regarding the logic to select the appropriate device to run on. --- .../blas1/KokkosBlas_dot_mv_perf_test.cpp | 21 +++++++++++++++---- .../blas/blas1/KokkosBlas_dot_perf_test.cpp | 4 ++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp index 49032307c4..d873b503d8 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp @@ -50,6 +50,7 @@ struct Params { int use_cuda = 0; int use_hip = 0; + int use_sycl = 0; int use_openmp = 0; int use_threads = 0; // m is vector length @@ -63,7 +64,8 @@ void print_options() { std::cerr << "Options:\n" << std::endl; std::cerr << "\tBACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | " - "'--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" + "'--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]' | " + "'--sycl [syclDeviceIndex]'" << std::endl; std::cerr << "\tIf no BACKEND selected, serial is the default." << std::endl; std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " @@ -89,7 +91,9 @@ int parse_inputs(Params& params, int argc, char** argv) { } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { params.use_cuda = atoi(argv[++i]) + 1; } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { - params.use_hip = atoi(argv[++i]) + 1; + params.use_hip = atoi(argv[++i]) + 1;; + } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) { + params.use_sycl = atoi(argv[++i]) + 1; } else if (0 == Test::string_compare_no_case(argv[i], "--m")) { params.m = atoi(argv[++i]); } else if (0 == Test::string_compare_no_case(argv[i], "--n")) { @@ -190,7 +194,7 @@ int main(int argc, char** argv) { if (parse_inputs(params, argc, argv)) { return 1; } - const int device_id = std::max(params.use_cuda, params.use_hip) - 1; + const int device_id = std::max(std::max(params.use_cuda, params.use_hip), params.use_sycl) - 1; const int num_threads = std::max(params.use_openmp, params.use_threads); @@ -200,7 +204,8 @@ int main(int argc, char** argv) { bool useOMP = params.use_openmp != 0; bool useCUDA = params.use_cuda != 0; bool useHIP = params.use_hip != 0; - bool useSerial = !useThreads && !useOMP && !useCUDA && !useHIP; + bool useSYCL = params.use_sycl != 0; + bool useSerial = !useThreads && !useOMP && !useCUDA && !useHIP && !useSYCL; if (useThreads) { #if defined(KOKKOS_ENABLE_THREADS) @@ -234,6 +239,14 @@ int main(int argc, char** argv) { #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; +#endif + } + if (useSYCL) { +#if defined(KOKKOS_ENABLE_SYCL) + run(params.m, params.n, params.repeat); +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + return 1; #endif } if (useSerial) { diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp index 9b36afca8f..33833b86a9 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp @@ -191,7 +191,7 @@ int main(int argc, char** argv) { if (parse_inputs(params, argc, argv)) { return 1; } - const int device_id = params.use_cuda - 1; + const int device_id = std::max(std::max(params.use_cuda, params.use_hip), params.use_sycl) - 1; const int num_threads = std::max(params.use_openmp, params.use_threads); @@ -202,7 +202,7 @@ int main(int argc, char** argv) { bool useCUDA = params.use_cuda != 0; bool useHIP = params.use_hip != 0; bool useSYCL = params.use_sycl != 0; - bool useSerial = !useThreads && !useOMP && !useCUDA; + bool useSerial = !useThreads && !useOMP && !useCUDA && !useHIP && !useSYCL; if (useThreads) { #if defined(KOKKOS_ENABLE_THREADS) From 9474177ad2241d8615ae5f038eb3286903484602 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 28 Jun 2022 13:02:06 -0600 Subject: [PATCH 190/261] dot perf test: applying clang-format --- perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp | 6 ++++-- perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp index d873b503d8..7690e0e653 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp @@ -91,7 +91,8 @@ int parse_inputs(Params& params, int argc, char** argv) { } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { params.use_cuda = atoi(argv[++i]) + 1; } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { - params.use_hip = atoi(argv[++i]) + 1;; + params.use_hip = atoi(argv[++i]) + 1; + ; } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) { params.use_sycl = atoi(argv[++i]) + 1; } else if (0 == Test::string_compare_no_case(argv[i], "--m")) { @@ -194,7 +195,8 @@ int main(int argc, char** argv) { if (parse_inputs(params, argc, argv)) { return 1; } - const int device_id = std::max(std::max(params.use_cuda, params.use_hip), params.use_sycl) - 1; + const int device_id = + std::max(std::max(params.use_cuda, params.use_hip), params.use_sycl) - 1; const int num_threads = std::max(params.use_openmp, params.use_threads); diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp index 33833b86a9..a2ca69e0c1 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp @@ -191,7 +191,8 @@ int main(int argc, char** argv) { if (parse_inputs(params, argc, argv)) { return 1; } - const int device_id = std::max(std::max(params.use_cuda, params.use_hip), params.use_sycl) - 1; + const int device_id = + std::max(std::max(params.use_cuda, params.use_hip), params.use_sycl) - 1; const int num_threads = std::max(params.use_openmp, params.use_threads); From a14302baf218f58c22951410c94df06bc4eb346b Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 28 Jun 2022 13:21:19 -0600 Subject: [PATCH 191/261] dot perf test: fixing small typo --- perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp index 7690e0e653..a57b534f32 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp @@ -92,7 +92,6 @@ int parse_inputs(Params& params, int argc, char** argv) { params.use_cuda = atoi(argv[++i]) + 1; } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { params.use_hip = atoi(argv[++i]) + 1; - ; } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) { params.use_sycl = atoi(argv[++i]) + 1; } else if (0 == Test::string_compare_no_case(argv[i], "--m")) { From 41189d4446fe88e6b00bdac7f82739f26e059e4a Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 28 Jun 2022 15:02:27 -0600 Subject: [PATCH 192/261] dot perf test: updating throw string with correct backend. --- perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp index a2ca69e0c1..a46f4d6b20 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp @@ -236,7 +236,7 @@ int main(int argc, char** argv) { #if defined(KOKKOS_ENABLE_HIP) run(params.m, params.repeat); #else - std::cout << "ERROR: CUDA requested, but not available.\n"; + std::cout << "ERROR: HIP requested, but not available.\n"; return 1; #endif } @@ -245,7 +245,7 @@ int main(int argc, char** argv) { #if defined(KOKKOS_ENABLE_SYCL) run(params.m, params.repeat); #else - std::cout << "ERROR: CUDA requested, but not available.\n"; + std::cout << "ERROR: SYCL requested, but not available.\n"; return 1; #endif } From c4bc6c38609495913d8ad43ee4c13b859e78861b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Thu, 23 Jun 2022 13:20:43 +0200 Subject: [PATCH 193/261] move SerialScale to KokkosBlas --- .../dense/KokkosBatched_Scale_Decl.hpp | 5 +- .../KokkosBatched_Gemm_Serial_Internal.hpp | 6 +- ...KokkosBatched_Gemm_TeamVector_Internal.hpp | 1 - .../KokkosBatched_Gemv_Serial_Internal.hpp | 6 +- .../impl/KokkosBatched_Gemv_Team_Internal.hpp | 1 - .../dense/impl/KokkosBatched_Scale_Impl.hpp | 10 --- .../impl/KokkosBatched_Scale_Internal.hpp | 32 +------ ...kosBatched_ShiftedTrsv_Serial_Internal.hpp | 1 - .../KokkosBatched_Trmm_Serial_Internal.hpp | 14 +-- .../KokkosBatched_Trsm_Serial_Internal.hpp | 14 +-- ...KokkosBatched_Trsm_TeamVector_Internal.hpp | 1 - .../impl/KokkosBatched_Trsm_Team_Internal.hpp | 1 - .../KokkosBatched_Trsv_Serial_Internal.hpp | 14 +-- ...KokkosBatched_Trsv_TeamVector_Internal.hpp | 1 - .../impl/KokkosBatched_Trsv_Team_Internal.hpp | 1 - .../KokkosBatched_Trtri_Serial_Internal.hpp | 8 +- src/blas/KokkosBlas1_serial_scal.hpp | 67 +++++++++++++++ .../impl/KokkosBlas1_serial_scal_impl.hpp | 86 +++++++++++++++++++ src/blas/impl/KokkosBlas3_trmm_impl.hpp | 1 - src/blas/impl/KokkosBlas3_trsm_impl.hpp | 4 +- .../impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 1 - .../dense/Test_Batched_SerialMatUtil.hpp | 7 +- 22 files changed, 202 insertions(+), 80 deletions(-) create mode 100644 src/blas/KokkosBlas1_serial_scal.hpp create mode 100644 src/blas/impl/KokkosBlas1_serial_scal_impl.hpp diff --git a/src/batched/dense/KokkosBatched_Scale_Decl.hpp b/src/batched/dense/KokkosBatched_Scale_Decl.hpp index f3ea9b0aab..baf301466d 100644 --- a/src/batched/dense/KokkosBatched_Scale_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Scale_Decl.hpp @@ -15,7 +15,10 @@ namespace KokkosBatched { struct SerialScale { template KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A); + const AViewType &A) { + assert(false && "Deprecated: use KokkosBlas::SerialScale"); + return 0; + } }; /// diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp index f2b009fe2f..11d0481a9d 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp @@ -6,7 +6,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" #include "KokkosBatched_InnerGemmFixC_Serial_Impl.hpp" @@ -43,7 +43,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( if (beta == zero) SerialSetInternal ::invoke(m, n, zero, C, cs0, cs1); else if (beta != one) - SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1); + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1); if (alpha != zero) { if (m <= 0 || n <= 0 || k <= 0) return 0; @@ -83,7 +83,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( if (beta == zero) SerialSetInternal ::invoke(m, n, zero, C, cs0, cs1); else if (beta != one) - SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1); + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1); if (alpha != zero) { if (m <= 0 || n <= 0 || k <= 0) return 0; diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp index b0c1f9c1ae..8c8e913f01 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp @@ -6,7 +6,6 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" namespace KokkosBatched { diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp index fbd4a1e2d3..59f404dd92 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp @@ -6,7 +6,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" #include "KokkosBatched_InnerMultipleDotProduct_Serial_Impl.hpp" @@ -41,7 +41,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( if (beta == zero) SerialSetInternal ::invoke(m, zero, y, ys0); else if (beta != one) - SerialScaleInternal::invoke(m, beta, y, ys0); + KokkosBlas::Impl::SerialScaleInternal::invoke(m, beta, y, ys0); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; @@ -80,7 +80,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( if (beta == zero) SerialSetInternal ::invoke(m, zero, y, ys0); else if (beta != one) - SerialScaleInternal::invoke(m, beta, y, ys0); + KokkosBlas::Impl::SerialScaleInternal::invoke(m, beta, y, ys0); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp index cc3f6d27ff..efc08144d2 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp @@ -6,7 +6,6 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" #include "KokkosBatched_InnerMultipleDotProduct_Serial_Impl.hpp" diff --git a/src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp index b4e865ddea..4b0ed29bb9 100644 --- a/src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp @@ -8,16 +8,6 @@ namespace KokkosBatched { -/// -/// Serial Impl -/// =========== -template -KOKKOS_INLINE_FUNCTION int SerialScale::invoke(const ScalarType alpha, - const AViewType &A) { - return SerialScaleInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1()); -} - /// /// Team Impl /// ========= diff --git a/src/batched/dense/impl/KokkosBatched_Scale_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Scale_Internal.hpp index 6f313ea919..f02d295267 100644 --- a/src/batched/dense/impl/KokkosBatched_Scale_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Scale_Internal.hpp @@ -4,39 +4,10 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" namespace KokkosBatched { -/// -/// Serial Internal Impl -/// ==================== -struct SerialScaleInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { -#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) -#pragma unroll -#endif - for (int i = 0; i < m; ++i) A[i * as0] *= alpha; - - return 0; - } - - template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { - if (as0 > as1) - for (int i = 0; i < m; ++i) invoke(n, alpha, A + i * as0, as1); - else - for (int j = 0; j < n; ++j) invoke(m, alpha, A + j * as1, as0); - - return 0; - } -}; - /// /// Team Internal Impl /// ==================== @@ -58,6 +29,7 @@ struct TeamScaleInternal { const ScalarType alpha, /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { + using KokkosBlas::Impl::SerialScaleInternal; if (m > n) { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, m), [&](const int &i) { diff --git a/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp index b0e2ea5b80..5fdfffe68f 100644 --- a/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp @@ -6,7 +6,6 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemv_Serial_Internal.hpp" diff --git a/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp index 9b5cc055e3..b97a6c17c2 100644 --- a/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp @@ -48,7 +48,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" namespace KokkosBatched { @@ -154,7 +154,8 @@ SerialTrmmInternalLeftLower::invoke( if (alpha == zero) SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -242,7 +243,8 @@ SerialTrmmInternalRightLower::invoke( if (alpha == zero) SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -323,7 +325,8 @@ SerialTrmmInternalLeftUpper::invoke( if (alpha == zero) SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -403,7 +406,8 @@ SerialTrmmInternalRightUpper::invoke( if (alpha == zero) SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp index b317bed4f7..409a17ddf3 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp @@ -6,7 +6,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" #include "KokkosBatched_InnerGemmFixA_Serial_Impl.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" @@ -41,7 +41,8 @@ SerialTrsmInternalLeftLower::invoke( if (alpha == zero) SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -89,7 +90,8 @@ SerialTrsmInternalLeftLower::invoke( if (alpha == zero) SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, bs1); @@ -156,7 +158,8 @@ SerialTrsmInternalLeftUpper::invoke( if (alpha == zero) SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType *KOKKOS_RESTRICT B0 = B; @@ -204,7 +207,8 @@ SerialTrsmInternalLeftUpper::invoke( if (alpha == zero) SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, bs1); diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp index 0afa92ae6e..8308200f12 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp @@ -6,7 +6,6 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" namespace KokkosBatched { diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp index 37e5051675..5baac85374 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp @@ -7,7 +7,6 @@ #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemm_Team_Internal.hpp" diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp index fb28ea5a9c..384c183f90 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp @@ -6,7 +6,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemv_Serial_Internal.hpp" @@ -44,7 +44,8 @@ SerialTrsvInternalLower::invoke( if (alpha == zero) SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -81,7 +82,8 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke( if (alpha == zero) SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; /// case GPU: team size is large and blocksize (mb,nb) is small @@ -137,7 +139,8 @@ SerialTrsvInternalUpper::invoke( if (alpha == zero) SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; ValueType *KOKKOS_RESTRICT b0 = b; @@ -172,7 +175,8 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke( if (alpha == zero) SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, 0); diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp index ad50e6fc2a..baca8bad13 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp @@ -6,7 +6,6 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" namespace KokkosBatched { diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp index 60b941e1ba..f1f6faed8c 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp @@ -6,7 +6,6 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemv_Team_Internal.hpp" diff --git a/src/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp index ee14040aed..8c8af6cbd5 100644 --- a/src/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp @@ -108,8 +108,8 @@ SerialTrtriInternalLower::invoke( // SCAL -- x=ax // A((j+1):n,j) = A_ii * A((j+1):n,j) - SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, A_ii, A_col_vec, - as0, as1); + KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, + A_ii, A_col_vec, as0, as1); } } return 0; @@ -157,8 +157,8 @@ SerialTrtriInternalUpper::invoke( // SCAL -- x=ax // A((j+1):n,j) = A_ii * A((j+1):n,j) - SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, A_ii, A_col_vec, - as0, as1); + KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, + A_ii, A_col_vec, as0, as1); } } return 0; diff --git a/src/blas/KokkosBlas1_serial_scal.hpp b/src/blas/KokkosBlas1_serial_scal.hpp new file mode 100644 index 0000000000..eacbda3079 --- /dev/null +++ b/src/blas/KokkosBlas1_serial_scal.hpp @@ -0,0 +1,67 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSBLAS1_SERIAL_SCAL_HPP_ +#define KOKKOSBLAS1_SERIAL_SCAL_HPP_ + +#include + +namespace KokkosBlas { + +/// +/// Serial Scale +/// + +struct SerialScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, + const AViewType &A) { + return KokkosBlas::Impl::SerialScaleInternal::invoke( + A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); + } +}; + +} // namespace KokkosBlas + +#endif diff --git a/src/blas/impl/KokkosBlas1_serial_scal_impl.hpp b/src/blas/impl/KokkosBlas1_serial_scal_impl.hpp new file mode 100644 index 0000000000..bb411ef4a5 --- /dev/null +++ b/src/blas/impl/KokkosBlas1_serial_scal_impl.hpp @@ -0,0 +1,86 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSBLAS1_SERIAL_SCAL_IMPL_HPP_ +#define KOKKOSBLAS1_SERIAL_SCAL_IMPL_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +/// +/// Serial Internal Impl +/// ==================== +struct SerialScaleInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, + const int as0) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = 0; i < m; ++i) A[i * as0] *= alpha; + + return 0; + } + + template + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, + const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1) { + if (as0 > as1) + for (int i = 0; i < m; ++i) invoke(n, alpha, A + i * as0, as1); + else + for (int j = 0; j < n; ++j) invoke(m, alpha, A + j * as1, as0); + + return 0; + } +}; + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/src/blas/impl/KokkosBlas3_trmm_impl.hpp b/src/blas/impl/KokkosBlas3_trmm_impl.hpp index 56bc2ba806..ee3e3a085d 100644 --- a/src/blas/impl/KokkosBlas3_trmm_impl.hpp +++ b/src/blas/impl/KokkosBlas3_trmm_impl.hpp @@ -54,7 +54,6 @@ #include "Kokkos_Core.hpp" #include "Kokkos_ArithTraits.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" #include "KokkosBatched_Trmm_Decl.hpp" #include "KokkosBatched_Trmm_Serial_Impl.hpp" diff --git a/src/blas/impl/KokkosBlas3_trsm_impl.hpp b/src/blas/impl/KokkosBlas3_trsm_impl.hpp index b215633093..4832a74719 100644 --- a/src/blas/impl/KokkosBlas3_trsm_impl.hpp +++ b/src/blas/impl/KokkosBlas3_trsm_impl.hpp @@ -75,7 +75,7 @@ int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, KokkosBatched::SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); else { if (alpha != one) - KokkosBatched::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -114,7 +114,7 @@ int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, KokkosBatched::SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); else { if (alpha != one) - KokkosBatched::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType* KOKKOS_RESTRICT B0 = B; diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index cc8551638f..888c168191 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -542,7 +542,6 @@ struct BsrMatrixSpMVTensorCoreDispatcher { #include "KokkosBatched_Gemv_TeamVector_Internal.hpp" #include "KokkosBatched_Gemm_Serial_Internal.hpp" #include "KokkosBatched_Gemm_TeamVector_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBlas2_team_gemv_spec.hpp" diff --git a/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp b/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp index f9a58f5442..76d6e5a381 100644 --- a/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp +++ b/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp @@ -7,8 +7,9 @@ #include "KokkosBatched_Set_Decl.hpp" #include "KokkosBatched_Set_Impl.hpp" -#include "KokkosBatched_Scale_Decl.hpp" -#include "KokkosBatched_Scale_Impl.hpp" +// TODO: move this test to KokkosBlas when both SerialScale and SerialSet are +// moved +#include "KokkosBlas1_serial_scal.hpp" // #include "KokkosBatched_Scale_Decl.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -36,7 +37,7 @@ struct Functor_TestBatchedSerialMatUtil { auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL()); switch (TestID) { case BatchedSet: SerialSet ::invoke(_alpha, A); break; - case BatchedScale: SerialScale::invoke(_alpha, A); break; + case BatchedScale: KokkosBlas::SerialScale::invoke(_alpha, A); break; } } From 2af0d2a11dfaf9a8d75183917189be89960ac084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Thu, 23 Jun 2022 14:01:27 +0200 Subject: [PATCH 194/261] move TeamScale and TeamVectorScale to KokkosBlas --- .../dense/KokkosBatched_Scale_Decl.hpp | 16 +++--- ...KokkosBatched_Gemm_TeamVector_Internal.hpp | 6 +- .../impl/KokkosBatched_Gemm_Team_Internal.hpp | 8 ++- ...KokkosBatched_Gemv_TeamVector_Internal.hpp | 4 +- .../impl/KokkosBatched_Gemv_Team_Internal.hpp | 4 +- .../dense/impl/KokkosBatched_Scale_Impl.hpp | 38 ------------- ...KokkosBatched_Trsm_TeamVector_Internal.hpp | 6 +- .../impl/KokkosBatched_Trsm_Team_Internal.hpp | 12 ++-- ...KokkosBatched_Trsv_TeamVector_Internal.hpp | 8 ++- .../impl/KokkosBatched_Trsv_Team_Internal.hpp | 12 ++-- src/blas/KokkosBlas1_team_scal.hpp | 37 ++++++++++++ .../impl/KokkosBlas1_team_scal_impl.hpp} | 57 ++++++++++++++++--- .../impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 4 +- .../dense/Test_Batched_TeamMatUtil.hpp | 5 +- 14 files changed, 139 insertions(+), 78 deletions(-) delete mode 100644 src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp rename src/{batched/dense/impl/KokkosBatched_Scale_Internal.hpp => blas/impl/KokkosBlas1_team_scal_impl.hpp} (59%) diff --git a/src/batched/dense/KokkosBatched_Scale_Decl.hpp b/src/batched/dense/KokkosBatched_Scale_Decl.hpp index baf301466d..7b07bc06a3 100644 --- a/src/batched/dense/KokkosBatched_Scale_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Scale_Decl.hpp @@ -3,9 +3,6 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) -#include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Vector.hpp" - namespace KokkosBatched { /// @@ -30,7 +27,10 @@ struct TeamScale { template KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, - const AViewType &A); + const AViewType &A) { + assert(false && "Deprecated: use KokkosBlas::TeamScale"); + return 0; + } }; /// @@ -42,11 +42,13 @@ struct TeamVectorScale { template KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, - const AViewType &A); + const AViewType &A) { + // static_assert(false); + assert(false && "Deprecated: use KokkosBlas::TeamVectorScale"); + return 0; + } }; } // namespace KokkosBatched -#include "KokkosBatched_Scale_Impl.hpp" - #endif diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp index 8c8e913f01..630fcf6c02 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp @@ -40,7 +40,8 @@ TeamVectorGemmInternal::invoke( if (beta == zero) TeamVectorSetInternal ::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - TeamVectorScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, + cs0, cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; @@ -80,7 +81,8 @@ TeamVectorGemmInternal::invoke( if (beta == zero) TeamVectorSetInternal ::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - TeamVectorScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, + cs0, cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp index 73d831586b..5825d0cb60 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp @@ -7,7 +7,7 @@ #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_team_scal.hpp" #include "KokkosBatched_InnerGemmFixC_Serial_Impl.hpp" @@ -43,7 +43,8 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( if (beta == zero) TeamSetInternal ::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - TeamScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, + cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; @@ -84,7 +85,8 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( if (beta == zero) TeamSetInternal ::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - TeamScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, + cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp index 419698a24e..6536a00eb7 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp @@ -6,7 +6,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosBatched_InnerMultipleDotProduct_Serial_Impl.hpp" @@ -60,7 +60,7 @@ TeamVectorGemvInternal::invoke( if (beta == zero) TeamVectorSetInternal ::invoke(member, m, zero, y, ys0); else if (beta != one) - TeamVectorScaleInternal::invoke(member, m, beta, y, ys0); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, beta, y, ys0); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp index efc08144d2..f8746e98b9 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp @@ -49,7 +49,7 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( if (beta == zero) TeamSetInternal ::invoke(member, m, zero, y, ys0); else if (beta != one) - TeamScaleInternal::invoke(member, m, beta, y, ys0); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, beta, y, ys0); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; @@ -88,7 +88,7 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( if (beta == zero) TeamSetInternal ::invoke(member, m, zero, y, ys0); else if (beta != one) - TeamScaleInternal::invoke(member, m, beta, y, ys0); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, beta, y, ys0); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; diff --git a/src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp deleted file mode 100644 index 4b0ed29bb9..0000000000 --- a/src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp +++ /dev/null @@ -1,38 +0,0 @@ -#ifndef __KOKKOSBATCHED_SCALE_IMPL_HPP__ -#define __KOKKOSBATCHED_SCALE_IMPL_HPP__ - -/// \author Kyungjoo Kim (kyukim@sandia.gov) - -#include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Scale_Internal.hpp" - -namespace KokkosBatched { - -/// -/// Team Impl -/// ========= - -template -template -KOKKOS_INLINE_FUNCTION int TeamScale::invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A) { - return TeamScaleInternal::invoke(member, A.extent(0), A.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1()); -} - -/// -/// TeamVector Impl -/// =============== - -template -template -KOKKOS_INLINE_FUNCTION int TeamVectorScale::invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A) { - return TeamVectorScaleInternal::invoke(member, A.extent(0), A.extent(1), - alpha, A.data(), A.stride_0(), - A.stride_1()); -} - -} // namespace KokkosBatched - -#endif diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp index 8308200f12..87d9a88122 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp @@ -37,7 +37,8 @@ TeamVectorTrsmInternalLeftLower::invoke( TeamVectorSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) - TeamVectorScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, + bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -98,7 +99,8 @@ TeamVectorTrsmInternalLeftUpper::invoke( TeamVectorSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) - TeamVectorScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, + bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType *KOKKOS_RESTRICT B0 = B; diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp index 5baac85374..407ed281db 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp @@ -41,7 +41,8 @@ TeamTrsmInternalLeftLower::invoke( TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) - TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, + bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -92,7 +93,8 @@ TeamTrsmInternalLeftLower::invoke( TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) - TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, + bs1); if (m <= 0 || n <= 0) return 0; /// @@ -175,7 +177,8 @@ TeamTrsmInternalLeftUpper::invoke( TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) - TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, + bs1); if (m <= 0 || n <= 0) return 0; ValueType *KOKKOS_RESTRICT B0 = B; @@ -231,7 +234,8 @@ TeamTrsmInternalLeftUpper::invoke( TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) - TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, + bs1); if (m <= 0 || n <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, bs1); diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp index baca8bad13..5b673b91b9 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp @@ -44,7 +44,9 @@ TeamVectorTrsvInternalLower::invoke( if (alpha == zero) TeamVectorSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) TeamVectorScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, + bs0); if (m <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -106,7 +108,9 @@ TeamVectorTrsvInternalUpper::invoke( if (alpha == zero) TeamVectorSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) TeamVectorScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, + bs0); if (m <= 0) return 0; ValueType *KOKKOS_RESTRICT b0 = b; diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp index f1f6faed8c..a71f71dd71 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp @@ -46,7 +46,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( if (alpha == zero) TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -91,7 +92,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( if (alpha == zero) TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; /// case GPU: team size is large and blocksize (mb,nb) is small @@ -155,7 +157,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( if (alpha == zero) TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; ValueType *KOKKOS_RESTRICT b0 = b; @@ -198,7 +201,8 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( if (alpha == zero) TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, 0); diff --git a/src/blas/KokkosBlas1_team_scal.hpp b/src/blas/KokkosBlas1_team_scal.hpp index 5fbe9688d1..af6c61f609 100644 --- a/src/blas/KokkosBlas1_team_scal.hpp +++ b/src/blas/KokkosBlas1_team_scal.hpp @@ -45,9 +45,46 @@ #ifndef KOKKOSBLAS1_TEAM_SCAL_HPP_ #define KOKKOSBLAS1_TEAM_SCAL_HPP_ +#include + +// TODO: deprecate/remove ? #include namespace KokkosBlas { + +/// +/// Team Scale +/// + +template +struct TeamScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const ScalarType alpha, + const AViewType& A) { + return Impl::TeamScaleInternal::invoke(member, A.extent(0), A.extent(1), + alpha, A.data(), A.stride_0(), + A.stride_1()); + } +}; + +/// +/// TeamVector Scale +/// + +template +struct TeamVectorScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const ScalarType alpha, + const AViewType& A) { + return Impl::TeamVectorScaleInternal::invoke(member, A.extent(0), + A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1()); + } +}; + +// TODO: deprecate/remove ? namespace Experimental { template diff --git a/src/batched/dense/impl/KokkosBatched_Scale_Internal.hpp b/src/blas/impl/KokkosBlas1_team_scal_impl.hpp similarity index 59% rename from src/batched/dense/impl/KokkosBatched_Scale_Internal.hpp rename to src/blas/impl/KokkosBlas1_team_scal_impl.hpp index f02d295267..6f4fdf40b0 100644 --- a/src/batched/dense/impl/KokkosBatched_Scale_Internal.hpp +++ b/src/blas/impl/KokkosBlas1_team_scal_impl.hpp @@ -1,12 +1,55 @@ -#ifndef __KOKKOSBATCHED_SCALE_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SCALE_INTERNAL_HPP__ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ -/// \author Kyungjoo Kim (kyukim@sandia.gov) +#ifndef KOKKOSBLAS1_TEAM_SCAL_IMPL_HPP_ +#define KOKKOSBLAS1_TEAM_SCAL_IMPL_HPP_ -#include "KokkosBatched_Util.hpp" +#include #include "KokkosBlas1_serial_scal_impl.hpp" -namespace KokkosBatched { +namespace KokkosBlas { +namespace Impl { /// /// Team Internal Impl @@ -29,7 +72,6 @@ struct TeamScaleInternal { const ScalarType alpha, /* */ ValueType *KOKKOS_RESTRICT A, const int as0, const int as1) { - using KokkosBlas::Impl::SerialScaleInternal; if (m > n) { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, m), [&](const int &i) { @@ -87,6 +129,7 @@ struct TeamVectorScaleInternal { } }; -} // namespace KokkosBatched +} // namespace Impl +} // namespace KokkosBlas #endif diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 888c168191..131924f418 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -638,7 +638,7 @@ struct BSR_GEMV_Functor { const y_value_type val_one = Kokkos::ArithTraits::one(); ; if (beta != val_one) { - KokkosBatched::TeamVectorScaleInternal::invoke( + KokkosBlas::Impl::TeamVectorScaleInternal::invoke( dev, block_dim, beta, Y_cur.data(), static_cast(Y_cur.stride_0())); } @@ -1275,7 +1275,7 @@ struct BSR_GEMM_Functor { const y_value_type val_one = Kokkos::ArithTraits::one(); if (beta != val_one) { - KokkosBatched::TeamVectorScaleInternal::invoke( + KokkosBlas::Impl::TeamVectorScaleInternal::invoke( dev, block_dim, num_rhs, beta, Y_cur.data(), static_cast(Y_cur.stride_0()), static_cast(Y_cur.stride_1())); diff --git a/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp b/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp index 16879444f7..d098edf0fb 100644 --- a/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp @@ -7,8 +7,7 @@ #include "KokkosBatched_Set_Decl.hpp" #include "KokkosBatched_Set_Impl.hpp" -#include "KokkosBatched_Scale_Decl.hpp" -#include "KokkosBatched_Scale_Impl.hpp" +// #include "KokkosBatched_Scale_Decl.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -40,7 +39,7 @@ struct Functor_TestBatchedTeamMatUtil { switch (TestID) { case BatchedSet: TeamSet::invoke(member, _alpha, A); break; case BatchedScale: - TeamScale::invoke(member, _alpha, A); + KokkosBlas::TeamScale::invoke(member, _alpha, A); break; } } From 9898b3d56c97512ba7cbddc5a177c0d7a9d622e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 28 Jun 2022 14:10:35 +0200 Subject: [PATCH 195/261] Fix missing headers --- src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp | 2 +- src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp | 1 + src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp | 1 + src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp | 1 + src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 1 + 5 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp index 5825d0cb60..a61d930017 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp @@ -7,7 +7,7 @@ #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBlas1_team_scal.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosBatched_InnerGemmFixC_Serial_Impl.hpp" diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp index f8746e98b9..9f90d42f58 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp @@ -6,6 +6,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosBatched_InnerMultipleDotProduct_Serial_Impl.hpp" diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp index 407ed281db..e65bb7a28f 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp @@ -7,6 +7,7 @@ #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemm_Team_Internal.hpp" diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp index a71f71dd71..49c580dabe 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp @@ -6,6 +6,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemv_Team_Internal.hpp" diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 131924f418..313098372a 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -542,6 +542,7 @@ struct BsrMatrixSpMVTensorCoreDispatcher { #include "KokkosBatched_Gemv_TeamVector_Internal.hpp" #include "KokkosBatched_Gemm_Serial_Internal.hpp" #include "KokkosBatched_Gemm_TeamVector_Internal.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBlas2_team_gemv_spec.hpp" From de4100ced5c2aed25aa782050ed5912ebb06ae6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 28 Jun 2022 14:10:52 +0200 Subject: [PATCH 196/261] Use Kokkos::abort() in deprecated interfaces --- src/batched/dense/KokkosBatched_Scale_Decl.hpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/batched/dense/KokkosBatched_Scale_Decl.hpp b/src/batched/dense/KokkosBatched_Scale_Decl.hpp index 7b07bc06a3..128b505c06 100644 --- a/src/batched/dense/KokkosBatched_Scale_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Scale_Decl.hpp @@ -3,6 +3,8 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) +#include "impl/Kokkos_Error.hpp" + namespace KokkosBatched { /// @@ -13,7 +15,9 @@ struct SerialScale { template KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, const AViewType &A) { - assert(false && "Deprecated: use KokkosBlas::SerialScale"); + Kokkos::abort( + "KokkosBatched::SerialScale is deprecated: use KokkosBlas::SerialScale " + "instead"); return 0; } }; @@ -28,7 +32,9 @@ struct TeamScale { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A) { - assert(false && "Deprecated: use KokkosBlas::TeamScale"); + Kokkos::abort( + "KokkosBatched::TeamScale is deprecated: use KokkosBlas::TeamScale " + "instead"); return 0; } }; @@ -44,7 +50,9 @@ struct TeamVectorScale { const ScalarType alpha, const AViewType &A) { // static_assert(false); - assert(false && "Deprecated: use KokkosBlas::TeamVectorScale"); + Kokkos::abort( + "KokkosBatched::TeamVectorScale is deprecated: use " + "KokkosBlas::TeamVectorScale instead"); return 0; } }; From b1a266e34e08c2f10aa21d54b43fe798c6c202a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 28 Jun 2022 14:40:44 +0200 Subject: [PATCH 197/261] Gather top interfaces in single header --- src/blas/KokkosBlas1_scal.hpp | 51 ++++++++++++++ src/blas/KokkosBlas1_serial_scal.hpp | 67 ------------------- src/blas/KokkosBlas1_team_scal.hpp | 37 ---------- .../dense/Test_Batched_SerialMatUtil.hpp | 2 +- 4 files changed, 52 insertions(+), 105 deletions(-) delete mode 100644 src/blas/KokkosBlas1_serial_scal.hpp diff --git a/src/blas/KokkosBlas1_scal.hpp b/src/blas/KokkosBlas1_scal.hpp index 2fc4f92f58..d533efe535 100644 --- a/src/blas/KokkosBlas1_scal.hpp +++ b/src/blas/KokkosBlas1_scal.hpp @@ -46,9 +46,15 @@ #define KOKKOSBLAS1_SCAL_HPP_ #include +#include +#include #include #include +/// +/// General/Host Scale +/// + namespace KokkosBlas { template @@ -108,6 +114,51 @@ void scal(const RMV& R, const AV& a, const XMV& X) { R_internal, a_internal, X_internal); } +/// +/// Serial Scale +/// + +struct SerialScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, + const AViewType& A) { + return Impl::SerialScaleInternal::invoke( + A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); + } +}; + +/// +/// Team Scale +/// + +template +struct TeamScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const ScalarType alpha, + const AViewType& A) { + return Impl::TeamScaleInternal::invoke(member, A.extent(0), A.extent(1), + alpha, A.data(), A.stride_0(), + A.stride_1()); + } +}; + +/// +/// TeamVector Scale +/// + +template +struct TeamVectorScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const ScalarType alpha, + const AViewType& A) { + return Impl::TeamVectorScaleInternal::invoke(member, A.extent(0), + A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1()); + } +}; + } // namespace KokkosBlas #endif diff --git a/src/blas/KokkosBlas1_serial_scal.hpp b/src/blas/KokkosBlas1_serial_scal.hpp deleted file mode 100644 index eacbda3079..0000000000 --- a/src/blas/KokkosBlas1_serial_scal.hpp +++ /dev/null @@ -1,67 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOSBLAS1_SERIAL_SCAL_HPP_ -#define KOKKOSBLAS1_SERIAL_SCAL_HPP_ - -#include - -namespace KokkosBlas { - -/// -/// Serial Scale -/// - -struct SerialScale { - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A) { - return KokkosBlas::Impl::SerialScaleInternal::invoke( - A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); - } -}; - -} // namespace KokkosBlas - -#endif diff --git a/src/blas/KokkosBlas1_team_scal.hpp b/src/blas/KokkosBlas1_team_scal.hpp index af6c61f609..5fbe9688d1 100644 --- a/src/blas/KokkosBlas1_team_scal.hpp +++ b/src/blas/KokkosBlas1_team_scal.hpp @@ -45,46 +45,9 @@ #ifndef KOKKOSBLAS1_TEAM_SCAL_HPP_ #define KOKKOSBLAS1_TEAM_SCAL_HPP_ -#include - -// TODO: deprecate/remove ? #include namespace KokkosBlas { - -/// -/// Team Scale -/// - -template -struct TeamScale { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const ScalarType alpha, - const AViewType& A) { - return Impl::TeamScaleInternal::invoke(member, A.extent(0), A.extent(1), - alpha, A.data(), A.stride_0(), - A.stride_1()); - } -}; - -/// -/// TeamVector Scale -/// - -template -struct TeamVectorScale { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, - const ScalarType alpha, - const AViewType& A) { - return Impl::TeamVectorScaleInternal::invoke(member, A.extent(0), - A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1()); - } -}; - -// TODO: deprecate/remove ? namespace Experimental { template diff --git a/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp b/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp index 76d6e5a381..e6c35dffcf 100644 --- a/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp +++ b/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp @@ -9,7 +9,7 @@ // TODO: move this test to KokkosBlas when both SerialScale and SerialSet are // moved -#include "KokkosBlas1_serial_scal.hpp" // #include "KokkosBatched_Scale_Decl.hpp" +#include "KokkosBlas1_scal.hpp" // #include "KokkosBatched_Scale_Decl.hpp" #include "KokkosKernels_TestUtils.hpp" From 6d3cbe0df002f2b2c82f86565269e6ed945dec6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Thu, 30 Jun 2022 12:14:34 +0200 Subject: [PATCH 198/261] clean up unused code --- src/batched/dense/KokkosBatched_Scale_Decl.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/batched/dense/KokkosBatched_Scale_Decl.hpp b/src/batched/dense/KokkosBatched_Scale_Decl.hpp index 128b505c06..c4e4082358 100644 --- a/src/batched/dense/KokkosBatched_Scale_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Scale_Decl.hpp @@ -49,7 +49,6 @@ struct TeamVectorScale { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, const AViewType &A) { - // static_assert(false); Kokkos::abort( "KokkosBatched::TeamVectorScale is deprecated: use " "KokkosBlas::TeamVectorScale instead"); From 5bec42c580167aaebd32699060f4b9d3d5ea1215 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Thu, 30 Jun 2022 12:14:49 +0200 Subject: [PATCH 199/261] Decorate [[deprecated]] batched routines --- .../dense/KokkosBatched_Scale_Decl.hpp | 55 +++++++++---------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/src/batched/dense/KokkosBatched_Scale_Decl.hpp b/src/batched/dense/KokkosBatched_Scale_Decl.hpp index c4e4082358..f0675892fc 100644 --- a/src/batched/dense/KokkosBatched_Scale_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Scale_Decl.hpp @@ -11,50 +11,49 @@ namespace KokkosBatched { /// Serial Scale /// -struct SerialScale { - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A) { - Kokkos::abort( +struct [[deprecated]] SerialScale{ + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, + const AViewType &A){Kokkos::abort( "KokkosBatched::SerialScale is deprecated: use KokkosBlas::SerialScale " "instead"); - return 0; - } -}; +return 0; +} // namespace KokkosBatched +} +; /// /// Team Scale /// template -struct TeamScale { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A) { - Kokkos::abort( +struct [[deprecated]] TeamScale{ + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A){Kokkos::abort( "KokkosBatched::TeamScale is deprecated: use KokkosBlas::TeamScale " "instead"); - return 0; - } -}; +return 0; +} +} +; /// /// TeamVector Scale /// template -struct TeamVectorScale { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A) { - Kokkos::abort( - "KokkosBatched::TeamVectorScale is deprecated: use " - "KokkosBlas::TeamVectorScale instead"); - return 0; - } -}; +struct [[deprecated]] TeamVectorScale{ + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType &member, const ScalarType alpha, const AViewType &A){ + Kokkos::abort("KokkosBatched::TeamVectorScale is deprecated: use " + "KokkosBlas::TeamVectorScale instead"); +return 0; +} +} +; } // namespace KokkosBatched From b63f88b1ab3e1d9a758efc3950e88da420146c9f Mon Sep 17 00:00:00 2001 From: Carl William Pearson Date: Wed, 29 Jun 2022 13:51:53 -0600 Subject: [PATCH 200/261] Removes a duplicate cuda_data_type_from when KOKKOS_HALF_T_IS_FLOAT --- src/sparse/KokkosSparse_Utils_cusparse.hpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/sparse/KokkosSparse_Utils_cusparse.hpp b/src/sparse/KokkosSparse_Utils_cusparse.hpp index 4c3ec96555..97a148007e 100644 --- a/src/sparse/KokkosSparse_Utils_cusparse.hpp +++ b/src/sparse/KokkosSparse_Utils_cusparse.hpp @@ -126,11 +126,18 @@ cudaDataType cuda_data_type_from() { throw std::logic_error("unreachable throw after static_assert"); } +/* If half_t is not float, need to define a conversion for both + otherwise, conversion for half_t IS conversion for float +*/ #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT template <> inline cudaDataType cuda_data_type_from() { return CUDA_R_16F; // Kokkos half_t is a half } +template <> +inline cudaDataType cuda_data_type_from() { + return CUDA_R_32F; +} #else template <> inline cudaDataType cuda_data_type_from() { @@ -138,10 +145,6 @@ inline cudaDataType cuda_data_type_from() { } #endif template <> -inline cudaDataType cuda_data_type_from() { - return CUDA_R_32F; -} -template <> inline cudaDataType cuda_data_type_from() { return CUDA_R_64F; } From b0e5c50d2d2ff776d19c44c60e4c4abb1186c336 Mon Sep 17 00:00:00 2001 From: Carl William Pearson Date: Tue, 5 Jul 2022 14:57:28 -0600 Subject: [PATCH 201/261] simplify KOKKOS_HALF_T_IS_FLOAT guard in cusparse utils --- src/sparse/KokkosSparse_Utils_cusparse.hpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/sparse/KokkosSparse_Utils_cusparse.hpp b/src/sparse/KokkosSparse_Utils_cusparse.hpp index 97a148007e..6e9eee5ab5 100644 --- a/src/sparse/KokkosSparse_Utils_cusparse.hpp +++ b/src/sparse/KokkosSparse_Utils_cusparse.hpp @@ -134,16 +134,13 @@ template <> inline cudaDataType cuda_data_type_from() { return CUDA_R_16F; // Kokkos half_t is a half } +#endif +// half_t is defined to be float, so this works for both half_t and float when +// half_t is float template <> inline cudaDataType cuda_data_type_from() { - return CUDA_R_32F; -} -#else -template <> -inline cudaDataType cuda_data_type_from() { return CUDA_R_32F; // Kokkos half_t is a float } -#endif template <> inline cudaDataType cuda_data_type_from() { return CUDA_R_64F; From e4c05bfe81ee890a12e91279fb4195886da585c7 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 6 Jul 2022 14:50:47 -0700 Subject: [PATCH 202/261] Try some changes --- src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 4 ++-- src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index a4733d5379..148f350452 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -354,8 +354,8 @@ struct ILUKLvlSchedTP1NumericFunctor { Kokkos::parallel_for(Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, U_row_map(prev_row + 1)), [&](const size_type kk) { nnz_lno_t col = static_cast(U_entries(kk)); nnz_lno_t ipos = iw(my_team, col); + auto lxu = -U_values(kk) * fact; if (ipos != -1) { - auto lxu = -U_values(kk) * fact; if (col < rowid) Kokkos::atomic_add(&L_values(ipos), lxu); else @@ -366,8 +366,8 @@ struct ILUKLvlSchedTP1NumericFunctor { for (nnz_lno_t kk = U_row_map(prev_row) + 1 + my_thread; kk < U_row_map(prev_row + 1); kk += ts) { nnz_lno_t col = static_cast(U_entries(kk)); nnz_lno_t ipos = iw(my_team, col); + auto lxu = -U_values(kk) * fact; if (ipos != -1) { - auto lxu = -U_values(kk) * fact; if (col < rowid) Kokkos::atomic_add(&L_values(ipos), lxu); else diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 817ee69626..79298d14ed 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -618,8 +618,13 @@ void iluk_symbolic(IlukHandle& thandle, level_list, level_ptr, level_idx, nlev); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + printf ("LEVEL SCHED on L\n"); level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, - level_idx, level_nchunks, level_nrowsperchunk, nlev); + level_idx, level_nchunks, level_nrowsperchunk, nlev);//ORIG + //Level scheduling on A??? + //printf ("LEVEL SCHED on A\n"); + //level_sched (thandle, A_row_map, A_entries, level_list, level_ptr, + // level_idx, level_nchunks, level_nrowsperchunk, nlev); thandle.alloc_level_nchunks(nlev); thandle.alloc_level_nrowsperchunk(nlev); From ea9b3d1ce60a23cd86bcd73be0313d39bf6ddb87 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 6 Jul 2022 19:18:36 -0400 Subject: [PATCH 203/261] HIP: fix warning from ExecSpaceUtils and GEMV Added a macro to catch return of rocBLAS functions and added logic to launch gemv on stream when using rocBLAS TPL. --- src/common/KokkosKernels_Error.hpp | 24 +++++++++++++++++++ src/common/KokkosKernels_ExecSpaceUtils.hpp | 14 ++++++----- .../tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 12 ++++++++++ 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/src/common/KokkosKernels_Error.hpp b/src/common/KokkosKernels_Error.hpp index b2f41fd4f6..11bd7f6953 100644 --- a/src/common/KokkosKernels_Error.hpp +++ b/src/common/KokkosKernels_Error.hpp @@ -54,6 +54,30 @@ inline void throw_runtime_exception(const std::string &msg) { throw std::runtime_error(msg); } +#if defined(KOKKOS_ENABLE_HIP) +inline void hip_internal_error_throw(hipError_t e, const char *name, + const char *file, const int line) { + std::ostringstream out; + out << name << " error( " << hipGetErrorName(e) + << "): " << hipGetErrorString(e); + if (file) { + out << " " << file << ":" << line; + } + throw_runtime_exception(out.str()); +} + +inline void hip_internal_safe_call(hipError_t e, const char *name, + const char *file = nullptr, + const int line = 0) { + if (hipSuccess != e) { + hip_internal_error_throw(e, name, file, line); + } +} + +#define KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(call) \ + hip_internal_safe_call(call, #call, __FILE__, __LINE__) +#endif + } // namespace Impl } // namespace KokkosKernels diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp index 444d787963..41e750e93e 100644 --- a/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -42,16 +42,17 @@ //@HEADER */ +#ifndef _KOKKOSKERNELSUTILSEXECSPACEUTILS_HPP +#define _KOKKOSKERNELSUTILSEXECSPACEUTILS_HPP + #include "Kokkos_Core.hpp" +#include "KokkosKernels_Error.hpp" #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) #include #include #endif -#ifndef _KOKKOSKERNELSUTILSEXECSPACEUTILS_HPP -#define _KOKKOSKERNELSUTILSEXECSPACEUTILS_HPP - namespace KokkosKernels { namespace Impl { @@ -64,6 +65,7 @@ enum ExecSpaceType { Exec_HIP, Exec_SYCL }; + template KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type() { ExecSpaceType exec_space = Exec_SERIAL; @@ -205,7 +207,7 @@ inline void kk_get_free_total_memory( template <> inline void kk_get_free_total_memory( size_t& free_mem, size_t& total_mem) { - hipMemGetInfo(&free_mem, &total_mem); + KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem)); } #endif @@ -368,12 +370,12 @@ template <> struct SpaceInstance { static Kokkos::Experimental::HIP create() { hipStream_t stream; - hipStreamCreate(&stream); + KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream)); return Kokkos::Experimental::HIP(stream); } static void destroy(Kokkos::Experimental::HIP& space) { hipStream_t stream = space.hip_stream(); - hipStreamDestroy(stream); + KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream)); } static bool overlap() { // TODO: does HIP have an equivalent for CUDA_LAUNCH_BLOCKING? diff --git a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 33ee439316..2d67c95c3e 100644 --- a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -613,9 +613,12 @@ namespace Impl { KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -657,9 +660,12 @@ namespace Impl { KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -702,6 +708,8 @@ namespace Impl { KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv( \ s.handle, transa, M, N, \ reinterpret_cast(&alpha), \ @@ -709,6 +717,7 @@ namespace Impl { reinterpret_cast(X.data()), one, \ reinterpret_cast(&beta), \ reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -751,6 +760,8 @@ namespace Impl { KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv( \ s.handle, transa, M, N, \ reinterpret_cast(&alpha), \ @@ -758,6 +769,7 @@ namespace Impl { reinterpret_cast(X.data()), one, \ reinterpret_cast(&beta), \ reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; From 7a0aaa032702fac51347483bfa2d22bedc186b0f Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 6 Jul 2022 23:37:20 -0700 Subject: [PATCH 204/261] Use LayoutRight for work view --- src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 148f350452..9f9b5ef73c 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -690,7 +690,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, using nnz_lno_t = typename IlukHandle::nnz_lno_t; using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; using WorkViewType = - Kokkos::View>; + Kokkos::View>; using LevelHostViewType = Kokkos::View; size_type nlevels = thandle.get_num_levels(); From 6eea42fa52983b9d5e6b3df11c0f514c89486aca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 28 Jun 2022 22:12:36 +0200 Subject: [PATCH 205/261] Move Set (Serial, Team and TeamVector) from KokkosBatched to KokkosBlas --- src/batched/dense/KokkosBatched_Set_Decl.hpp | 26 +++-- .../KokkosBatched_Gemm_Serial_Internal.hpp | 6 +- ...KokkosBatched_Gemm_TeamVector_Internal.hpp | 9 +- .../impl/KokkosBatched_Gemm_Team_Internal.hpp | 6 +- .../KokkosBatched_Gemv_Serial_Internal.hpp | 7 +- ...KokkosBatched_Gemv_TeamVector_Internal.hpp | 5 +- .../impl/KokkosBatched_Gemv_Team_Internal.hpp | 7 +- ...atched_HessenbergFormQ_Serial_Internal.hpp | 5 +- ...KokkosBatched_QR_FormQ_Serial_Internal.hpp | 2 +- ...osBatched_QR_FormQ_TeamVector_Internal.hpp | 5 +- .../dense/impl/KokkosBatched_Set_Impl.hpp | 48 --------- ...kosBatched_ShiftedTrsv_Serial_Internal.hpp | 4 +- .../KokkosBatched_Trmm_Serial_Internal.hpp | 10 +- .../KokkosBatched_Trsm_Serial_Internal.hpp | 11 +- ...KokkosBatched_Trsm_TeamVector_Internal.hpp | 9 +- .../impl/KokkosBatched_Trsm_Team_Internal.hpp | 11 +- .../KokkosBatched_Trsv_Serial_Internal.hpp | 11 +- ...KokkosBatched_Trsv_TeamVector_Internal.hpp | 7 +- .../impl/KokkosBatched_Trsv_Team_Internal.hpp | 11 +- src/blas/KokkosBlas1_set.hpp | 101 ++++++++++++++++++ .../impl/KokkosBlas1_set_impl.hpp} | 56 +++++++++- src/blas/impl/KokkosBlas3_trmm_impl.hpp | 1 - src/blas/impl/KokkosBlas3_trsm_impl.hpp | 5 +- .../dense/Test_Batched_SerialMatUtil.hpp | 5 +- .../dense/Test_Batched_TeamMatUtil.hpp | 7 +- .../dense/Test_Batched_TeamVectorQR.hpp | 4 +- 26 files changed, 246 insertions(+), 133 deletions(-) delete mode 100644 src/batched/dense/impl/KokkosBatched_Set_Impl.hpp create mode 100644 src/blas/KokkosBlas1_set.hpp rename src/{batched/dense/impl/KokkosBatched_Set_Internal.hpp => blas/impl/KokkosBlas1_set_impl.hpp} (66%) diff --git a/src/batched/dense/KokkosBatched_Set_Decl.hpp b/src/batched/dense/KokkosBatched_Set_Decl.hpp index 4ef0078e50..29ec3013a1 100644 --- a/src/batched/dense/KokkosBatched_Set_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Set_Decl.hpp @@ -3,8 +3,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) -#include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Vector.hpp" +#include "impl/Kokkos_Error.hpp" namespace KokkosBatched { /// @@ -14,7 +13,12 @@ namespace KokkosBatched { struct SerialSet { template KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A); + const AViewType &A) { + Kokkos::abort( + "KokkosBatched::SerialSet is deprecated: use KokkosBlas::SerialSet " + "instead"); + return 0; + } }; /// @@ -26,7 +30,12 @@ struct TeamSet { template KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, - const AViewType &A); + const AViewType &A) { + Kokkos::abort( + "KokkosBatched::TeamSet is deprecated: use KokkosBlas::TeamSet " + "instead"); + return 0; + } }; /// @@ -38,11 +47,14 @@ struct TeamVectorSet { template KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, const ScalarType alpha, - const AViewType &A); + const AViewType &A) { + Kokkos::abort( + "KokkosBatched::TeamVectorSet is deprecated: use " + "KokkosBlas::TeamVectorSet instead"); + return 0; + } }; } // namespace KokkosBatched -#include "KokkosBatched_Set_Impl.hpp" - #endif diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp index 11d0481a9d..1548d602e2 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp @@ -5,7 +5,7 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBlas1_serial_scal_impl.hpp" #include "KokkosBatched_InnerGemmFixC_Serial_Impl.hpp" @@ -41,7 +41,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - SerialSetInternal ::invoke(m, n, zero, C, cs0, cs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, C, cs0, cs1); else if (beta != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1); @@ -81,7 +81,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - SerialSetInternal ::invoke(m, n, zero, C, cs0, cs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, C, cs0, cs1); else if (beta != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1); diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp index 630fcf6c02..a516f765a1 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp @@ -5,7 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" namespace KokkosBatched { @@ -38,7 +39,8 @@ TeamVectorGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - TeamVectorSetInternal ::invoke(member, m, n, zero, C, cs0, cs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0, + cs1); else if (beta != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); @@ -79,7 +81,8 @@ TeamVectorGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - TeamVectorSetInternal ::invoke(member, m, n, zero, C, cs0, cs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0, + cs1); else if (beta != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp index a61d930017..4f147a98fc 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp @@ -6,7 +6,7 @@ #include "KokkosBatched_Util.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosBatched_InnerGemmFixC_Serial_Impl.hpp" @@ -41,7 +41,7 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - TeamSetInternal ::invoke(member, m, n, zero, C, cs0, cs1); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); @@ -83,7 +83,7 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - TeamSetInternal ::invoke(member, m, n, zero, C, cs0, cs1); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp index 59f404dd92..ef499b82fd 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp @@ -5,9 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBlas1_serial_scal_impl.hpp" - #include "KokkosBatched_InnerMultipleDotProduct_Serial_Impl.hpp" namespace KokkosBatched { @@ -39,7 +38,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( // y (m), A(m x n), B(n) if (beta == zero) - SerialSetInternal ::invoke(m, zero, y, ys0); + KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, y, ys0); else if (beta != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, beta, y, ys0); @@ -78,7 +77,7 @@ KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( constexpr int mbAlgo = Algo::Gemv::Blocked::mb(); if (beta == zero) - SerialSetInternal ::invoke(m, zero, y, ys0); + KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, y, ys0); else if (beta != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, beta, y, ys0); diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp index 6536a00eb7..406115aa4f 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp @@ -5,9 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBlas1_team_scal_impl.hpp" - #include "KokkosBatched_InnerMultipleDotProduct_Serial_Impl.hpp" namespace KokkosBatched { @@ -58,7 +57,7 @@ TeamVectorGemvInternal::invoke( // y (m), A(m x n), B(n) if (beta == zero) - TeamVectorSetInternal ::invoke(member, m, zero, y, ys0); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, y, ys0); else if (beta != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, beta, y, ys0); diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp index 9f90d42f58..cf611db5ca 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp @@ -5,9 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBlas1_team_scal_impl.hpp" - #include "KokkosBatched_InnerMultipleDotProduct_Serial_Impl.hpp" namespace KokkosBatched { @@ -48,7 +47,7 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( // y (m), A(m x n), B(n) if (beta == zero) - TeamSetInternal ::invoke(member, m, zero, y, ys0); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, y, ys0); else if (beta != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, beta, y, ys0); @@ -87,7 +86,7 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( constexpr int mbAlgo = Algo::Gemv::Blocked::mb(); if (beta == zero) - TeamSetInternal ::invoke(member, m, zero, y, ys0); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, y, ys0); else if (beta != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, beta, y, ys0); diff --git a/src/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp index 58cd9bad2d..4c0f39097f 100644 --- a/src/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp @@ -4,7 +4,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBatched_SetIdentity_Internal.hpp" #include "KokkosBatched_ApplyQ_Serial_Internal.hpp" @@ -37,7 +37,8 @@ struct SerialHessenbergFormQInternal { /// B is m x m // set identity if (is_Q_zero) - SerialSetInternal::invoke(m, value_type(1), Q, qs0 + qs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, value_type(1), Q, + qs0 + qs1); else SerialSetIdentityInternal::invoke(m, Q, qs0, qs1); diff --git a/src/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp index 46feefb91b..23171c063e 100644 --- a/src/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp @@ -4,7 +4,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBatched_SetIdentity_Internal.hpp" #include "KokkosBatched_ApplyQ_Serial_Internal.hpp" diff --git a/src/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp index 52178a095a..13a4ef4636 100644 --- a/src/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp @@ -4,7 +4,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBatched_SetIdentity_Internal.hpp" #include "KokkosBatched_ApplyQ_TeamVector_Internal.hpp" @@ -36,7 +36,8 @@ struct TeamVectorQR_FormQ_Internal { // set identity if (is_Q_zero) - TeamVectorSetInternal::invoke(member, m, value_type(1), Q, qs0 + qs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, value_type(1), + Q, qs0 + qs1); else TeamVectorSetIdentityInternal::invoke(member, m, n, Q, qs0, qs1); member.team_barrier(); diff --git a/src/batched/dense/impl/KokkosBatched_Set_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Set_Impl.hpp deleted file mode 100644 index 148e051ce4..0000000000 --- a/src/batched/dense/impl/KokkosBatched_Set_Impl.hpp +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef __KOKKOSBATCHED_SET_IMPL_HPP__ -#define __KOKKOSBATCHED_SET_IMPL_HPP__ - -/// \author Kyungjoo Kim (kyukim@sandia.gov) - -#include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" - -namespace KokkosBatched { - -/// -/// Serial Impl -/// =========== - -template -KOKKOS_INLINE_FUNCTION int SerialSet::invoke(const ScalarType alpha, - const AViewType &A) { - return SerialSetInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1()); -} - -/// -/// Team Impl -/// ========= - -template -template -KOKKOS_INLINE_FUNCTION int TeamSet::invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A) { - return TeamSetInternal::invoke(member, A.extent(0), A.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1()); -} - -/// -/// TeamVector Impl -/// =============== - -template -template -KOKKOS_INLINE_FUNCTION int TeamVectorSet::invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A) { - return TeamVectorSetInternal::invoke(member, A.extent(0), A.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1()); -} -} // end namespace KokkosBatched - -#endif diff --git a/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp index 5fdfffe68f..c6aec99d18 100644 --- a/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp @@ -5,8 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" - +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemv_Serial_Internal.hpp" diff --git a/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp index b97a6c17c2..ac53992064 100644 --- a/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp @@ -47,7 +47,7 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBlas1_serial_scal_impl.hpp" namespace KokkosBatched { @@ -152,7 +152,7 @@ SerialTrmmInternalLeftLower::invoke( if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0; if (alpha == zero) - SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); @@ -241,7 +241,7 @@ SerialTrmmInternalRightLower::invoke( if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0; if (alpha == zero) - SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); @@ -323,7 +323,7 @@ SerialTrmmInternalLeftUpper::invoke( if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0; if (alpha == zero) - SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); @@ -404,7 +404,7 @@ SerialTrmmInternalRightUpper::invoke( if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0; if (alpha == zero) - SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp index 409a17ddf3..b29b54931f 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp @@ -5,9 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBlas1_serial_scal_impl.hpp" - #include "KokkosBatched_InnerGemmFixA_Serial_Impl.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" @@ -39,7 +38,7 @@ SerialTrsmInternalLeftLower::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); @@ -88,7 +87,7 @@ SerialTrsmInternalLeftLower::invoke( const ScalarType one(1.0), zero(0.0), minus_one(-1.0); if (alpha == zero) - SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); @@ -156,7 +155,7 @@ SerialTrsmInternalLeftUpper::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); @@ -205,7 +204,7 @@ SerialTrsmInternalLeftUpper::invoke( constexpr int mbAlgo = Algo::Trsm::Blocked::mb(); if (alpha == zero) - SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp index 87d9a88122..08819e8c18 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp @@ -5,7 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" namespace KokkosBatched { @@ -34,7 +35,8 @@ TeamVectorTrsmInternalLeftLower::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - TeamVectorSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0, + bs1); else { if (alpha != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, @@ -96,7 +98,8 @@ TeamVectorTrsmInternalLeftUpper::invoke( // note that parallel range is different ( m*n vs m-1*n); if (alpha == zero) - TeamVectorSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0, + bs1); else { if (alpha != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp index e65bb7a28f..f9e2bed8f8 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp @@ -6,9 +6,8 @@ #include "KokkosBatched_Util.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBlas1_team_scal_impl.hpp" - #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemm_Team_Internal.hpp" @@ -39,7 +38,7 @@ TeamTrsmInternalLeftLower::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, @@ -91,7 +90,7 @@ TeamTrsmInternalLeftLower::invoke( // note that parallel range is different ( m*n vs m-1*n); if (alpha == zero) - TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, @@ -175,7 +174,7 @@ TeamTrsmInternalLeftUpper::invoke( // note that parallel range is different ( m*n vs m-1*n); if (alpha == zero) - TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, @@ -232,7 +231,7 @@ TeamTrsmInternalLeftUpper::invoke( // note that parallel range is different ( m*n vs m-1*n); if (alpha == zero) - TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp index 384c183f90..926003083a 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp @@ -5,9 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBlas1_serial_scal_impl.hpp" - #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemv_Serial_Internal.hpp" @@ -42,7 +41,7 @@ SerialTrsvInternalLower::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - SerialSetInternal::invoke(m, zero, b, bs0); + KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); @@ -80,7 +79,7 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke( constexpr int mbAlgo = Algo::Trsv::Blocked::mb(); if (alpha == zero) - SerialSetInternal::invoke(m, zero, b, bs0); + KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); @@ -137,7 +136,7 @@ SerialTrsvInternalUpper::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - SerialSetInternal::invoke(m, zero, b, bs0); + KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); @@ -173,7 +172,7 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke( // note that parallel range is different ( m*n vs m-1*n); if (alpha == zero) - SerialSetInternal::invoke(m, zero, b, bs0); + KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp index 5b673b91b9..b0da8f1f2d 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp @@ -5,7 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" namespace KokkosBatched { @@ -42,7 +43,7 @@ TeamVectorTrsvInternalLower::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - TeamVectorSetInternal::invoke(member, m, zero, b, bs0); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, b, bs0); else { if (alpha != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, @@ -106,7 +107,7 @@ TeamVectorTrsvInternalUpper::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - TeamVectorSetInternal::invoke(member, m, zero, b, bs0); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, b, bs0); else { if (alpha != one) KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp index 49c580dabe..aaf72e9876 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp @@ -5,9 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBlas1_team_scal_impl.hpp" - #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemv_Team_Internal.hpp" @@ -45,7 +44,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - TeamSetInternal::invoke(member, m, zero, b, bs0); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); @@ -91,7 +90,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( constexpr int mbAlgo = Algo::Trsv::Blocked::mb(); if (alpha == zero) - TeamSetInternal::invoke(member, m, zero, b, bs0); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); @@ -156,7 +155,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - TeamSetInternal::invoke(member, m, zero, b, bs0); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); @@ -200,7 +199,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( // note that parallel range is different ( m*n vs m-1*n); if (alpha == zero) - TeamSetInternal::invoke(member, m, zero, b, bs0); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { if (alpha != one) KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); diff --git a/src/blas/KokkosBlas1_set.hpp b/src/blas/KokkosBlas1_set.hpp new file mode 100644 index 0000000000..7a8473b2f7 --- /dev/null +++ b/src/blas/KokkosBlas1_set.hpp @@ -0,0 +1,101 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSBLAS1_SET_HPP_ +#define KOKKOSBLAS1_SET_HPP_ + +#include +// #include +// #include + +namespace KokkosBlas { + +/// +/// Serial Set +/// + +struct SerialSet { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, + const AViewType &A) { + return Impl::SerialSetInternal::invoke( + A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); + } +}; + +/// +/// Team Set +/// + +template +struct TeamSet { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A) { + return Impl::TeamSetInternal::invoke(member, A.extent(0), A.extent(1), + alpha, A.data(), A.stride_0(), + A.stride_1()); + } +}; + +/// +/// TeamVector Set +/// + +template +struct TeamVectorSet { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A) { + return Impl::TeamVectorSetInternal::invoke(member, A.extent(0), A.extent(1), + alpha, A.data(), A.stride_0(), + A.stride_1()); + } +}; + +} // namespace KokkosBlas + +#endif diff --git a/src/batched/dense/impl/KokkosBatched_Set_Internal.hpp b/src/blas/impl/KokkosBlas1_set_impl.hpp similarity index 66% rename from src/batched/dense/impl/KokkosBatched_Set_Internal.hpp rename to src/blas/impl/KokkosBlas1_set_impl.hpp index f18ac4355c..a3870a2e15 100644 --- a/src/batched/dense/impl/KokkosBatched_Set_Internal.hpp +++ b/src/blas/impl/KokkosBlas1_set_impl.hpp @@ -1,11 +1,56 @@ -#ifndef __KOKKOSBATCHED_SET_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SET_INTERNAL_HPP__ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef __KOKKOSBLAS_SET_IMPL_HPP__ +#define __KOKKOSBLAS_SET_IMPL_HPP__ /// \author Kyungjoo Kim (kyukim@sandia.gov) -#include "KokkosBatched_Util.hpp" +#include "Kokkos_Core.hpp" -namespace KokkosBatched { +namespace KokkosBlas { +namespace Impl { /// /// Serial Internal Impl @@ -115,6 +160,7 @@ struct TeamVectorSetInternal { } }; -} // end namespace KokkosBatched +} // namespace Impl +} // namespace KokkosBlas #endif diff --git a/src/blas/impl/KokkosBlas3_trmm_impl.hpp b/src/blas/impl/KokkosBlas3_trmm_impl.hpp index ee3e3a085d..2ba3363264 100644 --- a/src/blas/impl/KokkosBlas3_trmm_impl.hpp +++ b/src/blas/impl/KokkosBlas3_trmm_impl.hpp @@ -53,7 +53,6 @@ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" #include "Kokkos_ArithTraits.hpp" -#include "KokkosBatched_Set_Internal.hpp" #include "KokkosBatched_Trmm_Decl.hpp" #include "KokkosBatched_Trmm_Serial_Impl.hpp" diff --git a/src/blas/impl/KokkosBlas3_trsm_impl.hpp b/src/blas/impl/KokkosBlas3_trsm_impl.hpp index 4832a74719..d85b850138 100644 --- a/src/blas/impl/KokkosBlas3_trsm_impl.hpp +++ b/src/blas/impl/KokkosBlas3_trsm_impl.hpp @@ -54,6 +54,7 @@ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" #include "Kokkos_ArithTraits.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBatched_Trsm_Decl.hpp" #include "KokkosBatched_Trsm_Serial_Impl.hpp" @@ -72,7 +73,7 @@ int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, const ScalarType one(1.0), zero(0.0); if (alpha == zero) - KokkosBatched::SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); + SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); @@ -111,7 +112,7 @@ int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, const ScalarType one(1.0), zero(0.0); if (alpha == zero) - KokkosBatched::SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); + SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { if (alpha != one) KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); diff --git a/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp b/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp index e6c35dffcf..56939beb87 100644 --- a/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp +++ b/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp @@ -4,8 +4,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -#include "KokkosBatched_Set_Decl.hpp" -#include "KokkosBatched_Set_Impl.hpp" +#include "KokkosBlas1_set.hpp" // TODO: move this test to KokkosBlas when both SerialScale and SerialSet are // moved @@ -36,7 +35,7 @@ struct Functor_TestBatchedSerialMatUtil { void operator()(const KokkosKernelTag &, const int i) const { auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL()); switch (TestID) { - case BatchedSet: SerialSet ::invoke(_alpha, A); break; + case BatchedSet: KokkosBlas::SerialSet::invoke(_alpha, A); break; case BatchedScale: KokkosBlas::SerialScale::invoke(_alpha, A); break; } } diff --git a/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp b/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp index d098edf0fb..8a3c9939bf 100644 --- a/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp @@ -4,8 +4,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -#include "KokkosBatched_Set_Decl.hpp" -#include "KokkosBatched_Set_Impl.hpp" +#include "KokkosBlas1_set.hpp" // #include "KokkosBatched_Scale_Decl.hpp" @@ -37,7 +36,9 @@ struct Functor_TestBatchedTeamMatUtil { const int i = member.league_rank(); auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL()); switch (TestID) { - case BatchedSet: TeamSet::invoke(member, _alpha, A); break; + case BatchedSet: + KokkosBlas::TeamSet::invoke(member, _alpha, A); + break; case BatchedScale: KokkosBlas::TeamScale::invoke(member, _alpha, A); break; diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp index 4ae4ee4133..80bc7b246a 100644 --- a/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp @@ -4,7 +4,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -#include "KokkosBatched_Set_Decl.hpp" +#include "KokkosBlas1_set.hpp" #include "KokkosBatched_Copy_Decl.hpp" #include "KokkosBatched_Gemv_Decl.hpp" #include "KokkosBatched_Trsv_Decl.hpp" @@ -49,7 +49,7 @@ struct Functor_TestBatchedTeamVectorQR { [&](const int &i) { aa(i, i) += add_this; }); /// xx = 1 - TeamVectorSet::invoke(member, one, xx); + KokkosBlas::TeamVectorSet::invoke(member, one, xx); member.team_barrier(); /// bb = AA*xx From bbd50a9c116b90b19c5ce4258ec9cd41913c84f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Thu, 30 Jun 2022 12:08:40 +0200 Subject: [PATCH 206/261] Decorate [[deprecated]] batched routines --- src/batched/dense/KokkosBatched_Set_Decl.hpp | 55 ++++++++++---------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/src/batched/dense/KokkosBatched_Set_Decl.hpp b/src/batched/dense/KokkosBatched_Set_Decl.hpp index 29ec3013a1..fd67cdc99b 100644 --- a/src/batched/dense/KokkosBatched_Set_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Set_Decl.hpp @@ -10,50 +10,49 @@ namespace KokkosBatched { /// Serial Set /// -struct SerialSet { - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A) { - Kokkos::abort( +struct [[deprecated]] SerialSet{ + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, + const AViewType &A){Kokkos::abort( "KokkosBatched::SerialSet is deprecated: use KokkosBlas::SerialSet " "instead"); - return 0; - } -}; +return 0; +} // namespace KokkosBatched +} +; /// /// Team Set /// template -struct TeamSet { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A) { - Kokkos::abort( +struct [[deprecated]] TeamSet{ + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A){Kokkos::abort( "KokkosBatched::TeamSet is deprecated: use KokkosBlas::TeamSet " "instead"); - return 0; - } -}; +return 0; +} +} +; /// /// TeamVector Set /// template -struct TeamVectorSet { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A) { - Kokkos::abort( - "KokkosBatched::TeamVectorSet is deprecated: use " - "KokkosBlas::TeamVectorSet instead"); - return 0; - } -}; +struct [[deprecated]] TeamVectorSet{ + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType &member, const ScalarType alpha, const AViewType &A){ + Kokkos::abort("KokkosBatched::TeamVectorSet is deprecated: use " + "KokkosBlas::TeamVectorSet instead"); +return 0; +} +} +; } // namespace KokkosBatched From 255b495845c1969c3261e3128878fb8c4f7cbbbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Thu, 30 Jun 2022 12:26:31 +0200 Subject: [PATCH 207/261] remove unused headers (suggested by @e10harvey) --- src/blas/KokkosBlas1_set.hpp | 2 -- .../dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp | 1 - 2 files changed, 3 deletions(-) diff --git a/src/blas/KokkosBlas1_set.hpp b/src/blas/KokkosBlas1_set.hpp index 7a8473b2f7..61c03ec17a 100644 --- a/src/blas/KokkosBlas1_set.hpp +++ b/src/blas/KokkosBlas1_set.hpp @@ -46,8 +46,6 @@ #define KOKKOSBLAS1_SET_HPP_ #include -// #include -// #include namespace KokkosBlas { diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp index 3ae24bda84..72754a5e00 100644 --- a/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp @@ -4,7 +4,6 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -#include "KokkosBatched_Set_Decl.hpp" #include "KokkosBatched_Copy_Decl.hpp" #include "KokkosBatched_ApplyPivot_Decl.hpp" #include "KokkosBatched_Gemv_Decl.hpp" From 792a31b8be4d0637d9b95b3fe576f29af5cd3507 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Fri, 8 Jul 2022 13:42:06 +0200 Subject: [PATCH 208/261] Fix deprecated Kokkos::InitArguments --- ...okkosKernels_Example_Distance2GraphColor.cpp | 4 +++- ...KokkosKernels_Example_HashmapAccumulator.cpp | 4 +++- .../blas/blas1/KokkosBlas_dot_mv_perf_test.cpp | 4 +++- .../blas/blas1/KokkosBlas_dot_perf_test.cpp | 4 +++- .../blas1/KokkosBlas_team_dot_perf_test.cpp | 4 +++- .../blas/blas2/KokkosBlas2_gemv_perf_test.cpp | 4 +++- .../KokkosBlas3_gemm_standalone_perf_test.cpp | 4 +++- perf_test/graph/KokkosGraph_color.cpp | 4 +++- perf_test/graph/KokkosGraph_color_d2.cpp | 4 +++- perf_test/graph/KokkosGraph_triangle.cpp | 4 +++- perf_test/sparse/KokkosSparse_block_pcg.cpp | 8 +++++--- perf_test/sparse/KokkosSparse_pcg.cpp | 17 ++++++++--------- perf_test/sparse/KokkosSparse_spadd.cpp | 4 +++- perf_test/sparse/KokkosSparse_spgemm.cpp | 4 +++- perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp | 4 +++- 15 files changed, 52 insertions(+), 25 deletions(-) diff --git a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp index 99b398e40c..e921ed06cd 100644 --- a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp +++ b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp @@ -526,7 +526,9 @@ int main(int argc, char* argv[]) { params.use_openmp; // Assumption is that use_openmp variable is provided // as number of threads const int device_id = 0; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); // Print out information about the configuration of the run if verbose_level // >= 5 diff --git a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp index 9909c55720..aec112b584 100644 --- a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp +++ b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp @@ -384,7 +384,9 @@ int main(int argc, char* argv[]) { params.use_openmp; // Assumption is that use_openmp variable is provided // as number of threads - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); if (params.verbose) { Kokkos::print_configuration(std::cout); diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp index a57b534f32..7b353cf160 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp @@ -199,7 +199,9 @@ int main(int argc, char** argv) { const int num_threads = std::max(params.use_openmp, params.use_threads); - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); bool useThreads = params.use_threads != 0; bool useOMP = params.use_openmp != 0; diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp index a46f4d6b20..50840ddea6 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp @@ -196,7 +196,9 @@ int main(int argc, char** argv) { const int num_threads = std::max(params.use_openmp, params.use_threads); - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); bool useThreads = params.use_threads != 0; bool useOMP = params.use_openmp != 0; diff --git a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp index f8a2a5aa43..eeb49d6502 100644 --- a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp @@ -188,7 +188,9 @@ int main(int argc, char** argv) { const int num_threads = std::max(params.use_openmp, params.use_threads); - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); bool useThreads = params.use_threads != 0; bool useOMP = params.use_openmp != 0; diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp index a82ece030b..98e974229b 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp @@ -180,7 +180,9 @@ int main(int argc, char** argv) { const int num_threads = std::max(params.use_openmp, params.use_threads); const int device_id = std::max(params.use_cuda, params.use_hip) - 1; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); // Create booleans to handle pthreads, openmp and cuda params and initialize // to true; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp index 595292ebd7..6497db8de3 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp @@ -158,7 +158,9 @@ int main(int argc, char** argv) { // as number of threads const int device_id = params.use_cuda - 1; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); bool useOMP = params.use_openmp != 0; bool useCUDA = params.use_cuda != 0; diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index cc19c19675..8a97d77a38 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -536,7 +536,9 @@ int main(int argc, char **argv) { params.use_openmp; // Assumption is that use_openmp variable is provided // as number of threads const int device_id = std::max(params.use_cuda, params.use_hip) - 1; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); Kokkos::print_configuration(std::cout); #if defined(KOKKOS_ENABLE_OPENMP) diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp index b47fe21a70..b824ced38a 100644 --- a/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/perf_test/graph/KokkosGraph_color_d2.cpp @@ -632,7 +632,9 @@ int main(int argc, char* argv[]) { device_id = params.use_cuda - 1; else if (params.use_hip) device_id = params.use_hip - 1; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); // Print out verbose information about the configuration of the run. // Kokkos::print_configuration(std::cout); diff --git a/perf_test/graph/KokkosGraph_triangle.cpp b/perf_test/graph/KokkosGraph_triangle.cpp index 17e4a08de4..90ec6c2a61 100644 --- a/perf_test/graph/KokkosGraph_triangle.cpp +++ b/perf_test/graph/KokkosGraph_triangle.cpp @@ -296,7 +296,9 @@ int main(int argc, char **argv) { params.use_openmp; // Assumption is that use_openmp variable is provided // as number of threads const int device_id = 0; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); #if defined(KOKKOS_ENABLE_OPENMP) diff --git a/perf_test/sparse/KokkosSparse_block_pcg.cpp b/perf_test/sparse/KokkosSparse_block_pcg.cpp index 25d7a65fdd..73f4683525 100644 --- a/perf_test/sparse/KokkosSparse_block_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_block_pcg.cpp @@ -381,7 +381,7 @@ int main(int argc, char **argv) { int cmdline[CMD_COUNT]; char *mtx_bin_file = NULL; int block_size = 5; - struct Kokkos::InitArguments kargs; + struct Kokkos::InitializationSettings kargs; for (int i = 0; i < CMD_COUNT; ++i) cmdline[i] = 0; @@ -389,9 +389,11 @@ int main(int argc, char **argv) { if (0 == Test::string_compare_no_case(argv[i], "--serial")) { cmdline[CMD_USE_SERIAL] = 1; } else if (0 == Test::string_compare_no_case(argv[i], "--threads")) { - kargs.num_threads = cmdline[CMD_USE_THREADS] = atoi(argv[++i]); + cmdline[CMD_USE_THREADS] = atoi(argv[++i]); + kargs.set_num_threads(cmdline[CMD_USE_THREADS]); } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { - kargs.num_threads = cmdline[CMD_USE_OPENMP] = atoi(argv[++i]); + cmdline[CMD_USE_OPENMP] = atoi(argv[++i]); + kargs.set_num_threads(cmdline[CMD_USE_OPENMP]); } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { cmdline[CMD_USE_CUDA] = 1; } else if (0 == Test::string_compare_no_case(argv[i], "--mtx")) { diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index b485158125..51c2cbb01b 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -370,17 +370,16 @@ int main(int argc, char **argv) { return 0; } - Kokkos::InitArguments init_args; // Construct with default args, change - // members based on exec space + // Construct with default args, change members based on exec space + Kokkos::InitializationSettings init_args; - init_args.device_id = cmdline[CMD_DEVICE]; + init_args.set_device_id(cmdline[CMD_DEVICE]); + init_args.set_num_threads( + std::max(cmdline[CMD_USE_THREADS], cmdline[CMD_USE_OPENMP])); if (cmdline[CMD_USE_NUMA] && cmdline[CMD_USE_CORE_PER_NUMA]) { - init_args.num_threads = - std::max(cmdline[CMD_USE_THREADS], cmdline[CMD_USE_OPENMP]); - init_args.num_numa = cmdline[CMD_USE_NUMA]; - } else { - init_args.num_threads = - std::max(cmdline[CMD_USE_THREADS], cmdline[CMD_USE_OPENMP]); + KokkosKernels::Impl::throw_runtime_exception( + "NUMA init arg is no longer supported by Kokkos"); + // init_args.num_numa = cmdline[CMD_USE_NUMA]; } Kokkos::initialize(init_args); diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 5a273e6694..5448843168 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -476,7 +476,9 @@ int main(int argc, char** argv) { // as number of threads const int device_id = params.use_cuda - 1; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); // Kokkos::print_configuration(std::cout); // First, make sure that requested TPL (if any) is actually available diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp index 9fada4caaa..da705fcdf2 100644 --- a/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -294,7 +294,9 @@ int main(int argc, char** argv) { const int device_id = params.use_cuda ? params.use_cuda - 1 : params.use_hip - 1; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); Kokkos::print_configuration(std::cout); #if defined(KOKKOS_ENABLE_OPENMP) diff --git a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp index 98942acb27..aa3969e6c8 100644 --- a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp @@ -259,7 +259,9 @@ int main(int argc, char** argv) { const int num_threads = std::max(params.use_openmp, params.use_threads); const int device_id = params.use_cuda - 1; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); Kokkos::print_configuration(std::cout); #if defined(KOKKOS_ENABLE_OPENMP) From 01b14d209ffe0a208712d1fe76ff71e494b2bc4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Mon, 11 Jul 2022 20:29:48 +0200 Subject: [PATCH 209/261] Fix: don't use Kokkos private headers --- src/graph/impl/KokkosGraph_Distance2Color_impl.hpp | 1 - src/sparse/KokkosSparse_csc2csr.hpp | 9 +-------- src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp | 1 - unit_test/common/Test_Common_set_bit_count.hpp | 1 - unit_test/sparse/Test_Sparse_spgemm.hpp | 1 - unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp | 1 - unit_test/sparse/Test_Sparse_spiluk.hpp | 1 - unit_test/sparse/Test_Sparse_sptrsv.hpp | 5 ++--- 8 files changed, 3 insertions(+), 17 deletions(-) diff --git a/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp index ed40646711..c8dddcefb8 100644 --- a/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp @@ -51,7 +51,6 @@ #include #include -#include #include #include diff --git a/src/sparse/KokkosSparse_csc2csr.hpp b/src/sparse/KokkosSparse_csc2csr.hpp index 83a96c3c02..32f0c2b745 100644 --- a/src/sparse/KokkosSparse_csc2csr.hpp +++ b/src/sparse/KokkosSparse_csc2csr.hpp @@ -43,14 +43,7 @@ */ #include "KokkosKernels_Utils.hpp" -#include -#include -#include -#include -#include -#include -#include -#include +#include #ifndef _KOKKOSSPARSE_CSC2CSR_HPP #define _KOKKOSSPARSE_CSC2CSR_HPP diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp index 90c35dbaf8..fd32eb08fe 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp @@ -50,7 +50,6 @@ #endif #include "KokkosKernels_Utils.hpp" -#include #include namespace KokkosSparse { diff --git a/unit_test/common/Test_Common_set_bit_count.hpp b/unit_test/common/Test_Common_set_bit_count.hpp index a085cc0024..c6163b8db4 100644 --- a/unit_test/common/Test_Common_set_bit_count.hpp +++ b/unit_test/common/Test_Common_set_bit_count.hpp @@ -48,7 +48,6 @@ #include "KokkosKernels_BitUtils.hpp" #include "KokkosKernels_SimpleUtils.hpp" #include "KokkosKernels_PrintUtils.hpp" -#include #include #include diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp index a1e33c0ca6..35473046d8 100644 --- a/unit_test/sparse/Test_Sparse_spgemm.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm.hpp @@ -47,7 +47,6 @@ #include "KokkosSparse_Utils.hpp" #include "KokkosSparse_SortCrs.hpp" -#include #include #include diff --git a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp index f9db6f4d8d..4ac707c249 100644 --- a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp @@ -47,7 +47,6 @@ #include "KokkosSparse_Utils.hpp" #include "KokkosSparse_SortCrs.hpp" -#include #include #include diff --git a/unit_test/sparse/Test_Sparse_spiluk.hpp b/unit_test/sparse/Test_Sparse_spiluk.hpp index 8f9ef99063..863bdf0808 100644 --- a/unit_test/sparse/Test_Sparse_spiluk.hpp +++ b/unit_test/sparse/Test_Sparse_spiluk.hpp @@ -45,7 +45,6 @@ #include #include -#include #include #include diff --git a/unit_test/sparse/Test_Sparse_sptrsv.hpp b/unit_test/sparse/Test_Sparse_sptrsv.hpp index 08c5494c88..c470747202 100644 --- a/unit_test/sparse/Test_Sparse_sptrsv.hpp +++ b/unit_test/sparse/Test_Sparse_sptrsv.hpp @@ -45,7 +45,6 @@ #include #include -#include #include #include @@ -122,7 +121,7 @@ void run_test_sptrsv_mtx() { bool is_lower_tri = true; std::cout << "Create handle" << std::endl; kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); - + std::cout << "Prepare linear system" << std::endl; // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs ValuesType known_lhs("known_lhs", nrows); @@ -239,7 +238,7 @@ void run_test_sptrsv_mtx() { bool is_lower_tri = false; std::cout << "Create handle" << std::endl; kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); - + std::cout << "Prepare linear system" << std::endl; // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs ValuesType known_lhs("known_lhs", nrows); From 87b5723e9c5b46e39641c773fa37438bc8500956 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Mon, 11 Jul 2022 20:30:03 +0200 Subject: [PATCH 210/261] Remove unused/duplicated headers --- src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp | 1 - unit_test/common/Test_Common_set_bit_count.hpp | 3 --- 2 files changed, 4 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp index fd32eb08fe..e6f0c26497 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl2phase_impl.hpp @@ -50,7 +50,6 @@ #endif #include "KokkosKernels_Utils.hpp" -#include namespace KokkosSparse { namespace Impl { diff --git a/unit_test/common/Test_Common_set_bit_count.hpp b/unit_test/common/Test_Common_set_bit_count.hpp index c6163b8db4..937a2fdf1b 100644 --- a/unit_test/common/Test_Common_set_bit_count.hpp +++ b/unit_test/common/Test_Common_set_bit_count.hpp @@ -51,9 +51,6 @@ #include #include -#include -#include - // const char *input_filename = "sherman1.mtx"; // const char *input_filename = "Si2.mtx"; // const char *input_filename = "wathen_30_30.mtx"; From 8e6986d13651ea84b965a41fca356df431a71cc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 12 Jul 2022 19:55:11 +0200 Subject: [PATCH 211/261] Fix: Use default layout for temp views in batched GESV --- src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp index 616df45df9..a9e10a1ebd 100644 --- a/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -545,7 +545,6 @@ struct TeamGesv { #endif using ScratchPadMatrixViewType = Kokkos::View< typename MatrixType::non_const_value_type **, - typename MatrixType::array_layout, typename MatrixType::execution_space::scratch_memory_space>; const int n = A.extent(0); @@ -682,7 +681,6 @@ struct TeamVectorGesv { #endif using ScratchPadMatrixViewType = Kokkos::View< typename MatrixType::non_const_value_type **, - typename MatrixType::array_layout, typename MatrixType::execution_space::scratch_memory_space>; const int n = A.extent(0); From 0060bcd8d02672bf486861807fe385614bad7e8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Tue, 12 Jul 2022 19:55:11 +0200 Subject: [PATCH 212/261] Fix: call print_configuration() on instances (no longer static) --- perf_test/graph/KokkosGraph_triangle.cpp | 6 +++--- perf_test/sparse/KokkosSparse_block_pcg.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/perf_test/graph/KokkosGraph_triangle.cpp b/perf_test/graph/KokkosGraph_triangle.cpp index 90ec6c2a61..be0b57492a 100644 --- a/perf_test/graph/KokkosGraph_triangle.cpp +++ b/perf_test/graph/KokkosGraph_triangle.cpp @@ -303,7 +303,7 @@ int main(int argc, char **argv) { #if defined(KOKKOS_ENABLE_OPENMP) if (params.use_openmp) { - Kokkos::OpenMP::print_configuration(std::cout); + Kokkos::OpenMP().print_configuration(std::cout); #ifdef KOKKOSKERNELS_MULTI_MEM KokkosKernels::Experiment::run_multi_mem_triangle< size_type, idx, Kokkos::OpenMP, Kokkos::OpenMP::memory_space, @@ -319,7 +319,7 @@ int main(int argc, char **argv) { #if defined(KOKKOS_ENABLE_CUDA) if (params.use_cuda) { - Kokkos::Cuda::print_configuration(std::cout); + Kokkos::Cuda().print_configuration(std::cout); #ifdef KOKKOSKERNELS_MULTI_MEM KokkosKernels::Experiment::run_multi_mem_triangle< size_type, idx, Kokkos::Cuda, Kokkos::Cuda::memory_space, @@ -335,7 +335,7 @@ int main(int argc, char **argv) { #if defined(KOKKOS_ENABLE_HIP) if (params.use_hip) { - Kokkos::Experimental::HIP::print_configuration(std::cout); + Kokkos::Experimental::HIP().print_configuration(std::cout); KokkosKernels::Experiment::run_multi_mem_triangle< size_type, idx, Kokkos::Experimental::HIP, Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace>(params); diff --git a/perf_test/sparse/KokkosSparse_block_pcg.cpp b/perf_test/sparse/KokkosSparse_block_pcg.cpp index 73f4683525..8e453b4d01 100644 --- a/perf_test/sparse/KokkosSparse_block_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_block_pcg.cpp @@ -437,7 +437,7 @@ int main(int argc, char **argv) { if (cmdline[CMD_USE_SERIAL]) { using myExecSpace = Kokkos::Serial; - Kokkos::Serial::print_configuration(std::cout); + myExecSpace().print_configuration(std::cout); using crsMat_t = typename KokkosSparse::CrsMatrix Date: Tue, 12 Jul 2022 16:44:05 -0600 Subject: [PATCH 213/261] trsv: remove assumptions about entry order within rows In trsv, don't assume the diagonal entry is first in its row if the matrix is upper triangular. Test this by randomly shuffling each row of the matrix used for testing. --- src/sparse/impl/KokkosSparse_trsv_impl.hpp | 35 +++++++++++++--------- test_common/KokkosKernels_TestUtils.hpp | 27 +++++++++++++++++ unit_test/sparse/Test_Sparse_trsv.hpp | 8 +++++ 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_trsv_impl.hpp b/src/sparse/impl/KokkosSparse_trsv_impl.hpp index f076368827..bff037c228 100644 --- a/src/sparse/impl/KokkosSparse_trsv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_trsv_impl.hpp @@ -218,6 +218,7 @@ void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, typename CrsMatrixType::row_map_type ptr = A.graph.row_map; typename CrsMatrixType::index_type ind = A.graph.entries; typename CrsMatrixType::values_type val = A.values; + typedef Kokkos::Details::ArithTraits STS; // If local_ordinal_type is unsigned and numRows is 0, the loop // below will have entirely the wrong number of iterations. @@ -232,15 +233,18 @@ void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, for (local_ordinal_type j = 0; j < numVecs; ++j) { X(r, j) = Y(r, j); } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - // We assume the diagonal entry is first in the row. - const matrix_scalar_type A_rr = val(beg); - for (offset_type k = beg + static_cast(1); k < end; ++k) { + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + matrix_scalar_type A_rr = STS::zero(); + for (offset_type k = beg; k < end; ++k) { const matrix_scalar_type A_rc = val(k); const local_ordinal_type c = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); + if (r == c) { + A_rr += A_rc; + } else { + for (local_ordinal_type j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } } } // for each entry A_rc in the current row r for (local_ordinal_type j = 0; j < numVecs; ++j) { @@ -254,15 +258,18 @@ void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, for (local_ordinal_type j = 0; j < numVecs; ++j) { X(r, j) = Y(r, j); } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - // We assume the diagonal entry is first in the row. - const matrix_scalar_type A_rr = val(beg); - for (offset_type k = beg + 1; k < end; ++k) { + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + matrix_scalar_type A_rr = STS::zero(); + for (offset_type k = beg; k < end; ++k) { const matrix_scalar_type A_rc = val(k); const local_ordinal_type c = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); + if (r == c) + A_rr += A_rc; + else { + for (local_ordinal_type j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } } } // for each entry A_rc in the current row r for (local_ordinal_type j = 0; j < numVecs; ++j) { diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index e7296b45a7..976da2c358 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -626,5 +626,32 @@ class RandCscMat { ColMapViewTypeD get_col_map() { return __getter_copy_helper(__col_map_d); } }; +/// \brief Randomly shuffle the entries in each row (col) of a Crs (Ccs) matrix. +template +void shuffleMatrixEntries(Rowptrs rowptrs, Entries entries, Values values) { + using size_type = typename Rowptrs::non_const_value_type; + using ordinal_type = typename Entries::value_type; + auto rowptrsHost = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowptrs); + auto entriesHost = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); + auto valuesHost = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), values); + ordinal_type numRows = + rowptrsHost.extent(0) ? (rowptrsHost.extent(0) - 1) : 0; + for (ordinal_type i = 0; i < numRows; i++) { + size_type rowBegin = rowptrsHost(i); + size_type rowEnd = rowptrsHost(i + 1); + for (size_type j = rowBegin; j < rowEnd - 1; j++) { + ordinal_type swapRange = rowEnd - j; + size_type swapOffset = j + (rand() % swapRange); + std::swap(entriesHost(j), entriesHost(swapOffset)); + std::swap(valuesHost(j), valuesHost(swapOffset)); + } + } + Kokkos::deep_copy(entries, entriesHost); + Kokkos::deep_copy(values, valuesHost); +} + } // namespace Test #endif diff --git a/unit_test/sparse/Test_Sparse_trsv.hpp b/unit_test/sparse/Test_Sparse_trsv.hpp index 776674344a..938b040743 100644 --- a/unit_test/sparse/Test_Sparse_trsv.hpp +++ b/unit_test/sparse/Test_Sparse_trsv.hpp @@ -79,6 +79,10 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, crsMat_t lower_part = KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( 'L', numRows, numCols, nnz, row_size_variance, bandwidth); + + Test::shuffleMatrixEntries(lower_part.graph.row_map, lower_part.graph.entries, + lower_part.values); + KokkosSparse::spmv("N", alpha, lower_part, b_x_copy, beta, b_y); Test::check_trsv_mv(lower_part, b_x, b_y, b_x_copy, numMV, "L", "N"); @@ -89,6 +93,10 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, crsMat_t upper_part = KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( 'U', numRows, numCols, nnz, row_size_variance, bandwidth); + + Test::shuffleMatrixEntries(upper_part.graph.row_map, upper_part.graph.entries, + upper_part.values); + KokkosSparse::spmv("N", alpha, upper_part, b_x_copy, beta, b_y); Test::check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "N"); From f1dac18f118e1b3212992a6b194713e6b3c84c65 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 11 Jul 2022 18:34:04 -0600 Subject: [PATCH 214/261] update nightly testing scripts - Add NVIDIA Ampere Arch naming options - Update compilers/modules for weaver and caraway - Disable deprecated code by default --- cm_generate_makefile.bash | 12 ++++++++++-- scripts/cm_test_all_sandia | 25 ++++++++++++++++++++++--- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 043dcc2196..ee195ca0fe 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -274,6 +274,8 @@ display_help_text() { echo " Pascal61 = NVIDIA Pascal generation CC 6.1" echo " Volta70 = NVIDIA Volta generation CC 7.0" echo " Volta72 = NVIDIA Volta generation CC 7.2" + echo " Ampere80 = NVIDIA Ampere generation CC 8.0" + echo " Ampere86 = NVIDIA Ampere generation CC 8.6" echo "" echo "--compiler=/Path/To/Compiler Set the compiler." echo "" @@ -335,6 +337,7 @@ display_help_text() { echo "--kokkos-make-j=[NUM]: Set -j parallel level for kokkos install" echo " Default: j == 4" echo "--enable-tests: build Kokkos Kernels unit and performance tests" + echo "--deprecated-code Enable deprecated code (disabled by default)" echo "--enable-perfsuite: build Kokkos Kernels performance tests with RAJAPerf Suite" @@ -360,6 +363,8 @@ KERNELS_DEFAULT_ETI_OPTION="" WITH_CUDA_BACKEND=OFF WITH_HIP_BACKEND=OFF +KOKKOS_DEPRECATED_CODE=OFF + while [[ $# > 0 ]] do key="$1" @@ -522,6 +527,9 @@ do --disable-examples) KOKKOSKERNELS_DO_EXAMPLES=OFF ;; + --deprecated-code) + KOKKOS_DEPRECATED_CODE=ON + ;; --compiler*) COMPILER="${key#*=}" CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep "no ${COMPILER}" | wc -l) @@ -738,9 +746,9 @@ cd ${KOKKOS_INSTALL_PATH} # Configure kokkos echo "" -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} echo "" -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES}${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES}${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} # Install kokkos library make install -j $KOKKOS_MAKEINSTALL_J diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 16ef7dc9dc..db7289619d 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -20,6 +20,7 @@ print_help() { echo "--spack: Run spack builds rather than direct CMake tests" echo "" echo "--debug: Run tests in debug. Defaults to False" + echo "--deprecated-code: Enable deprecated code (disabled by default)" echo "--boundscheck: Enable Kokkos_ENABLE_DEBUG_BOUNDS_CHECK to check View accesses within bounds." echo "--test-script: Test this script, not Kokkos" echo "--skip-hwloc: Do not do hwloc tests" @@ -266,6 +267,9 @@ KOKKOSKERNELS_OFFSETS="int,size_t" KOKKOSKERNELS_LAYOUTS="LayoutLeft" CTESTTIMEOUT=2500 + +KOKKOS_DEPRECATED_CODE="" + # # Handle arguments. # @@ -290,6 +294,9 @@ do --boundscheck*) KOKKOS_BOUNDS_CHECK="--boundscheck" ;; + --deprecated-code) + KOKKOS_DEPRECATED_CODE="--deprecated-code" + ;; --build-only*) BUILD_ONLY=True ;; @@ -672,6 +679,8 @@ elif [ "$MACHINE" = "weaver" ]; then GCC74_MODULE_TPL_LIST="cmake/3.19.3,/,openblas/0.2.20/gcc/7.2.0,gcc/7.4.0" CUDA_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" CUDA10_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.2.0,openblas/0.2.20/gcc/7.2.0" + # Cuda/11 modules available only on the dev queue (rhel8 OS); gcc/8.3.1 load by default + CUDA11_MODULE_LIST="cmake/3.21.2,/,openblas/0.3.18/gcc/8.3.1" # Issues finding CUBLAS with cuda/10.1.243 module at configure # "Could NOT find TPLCUBLAS (missing: CUDA_CUBLAS_LIBRARIES)" # Once resolved add the compiler + modules below to the SPOT_CHECK_TPLS @@ -707,6 +716,8 @@ elif [ "$MACHINE" = "weaver" ]; then "cuda/10.1.105 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.1.243 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.2.089 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.2.2 $CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.2.2 $CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) fi @@ -756,6 +767,8 @@ elif [ "$MACHINE" = "caraway" ]; then # output description and success based only on build succes; build time output (no run-time) BASE_MODULE_LIST="cmake/3.19.3,/" + # Cuda11 usage available on the V100 queue + CUDA11_MODULE_LIST="cmake/3.22.2,/,gcc/8.2.0" HIPCLANG_BUILD_LIST="Hip_Serial" HIPCLANG_WARNING_FLAGS="" @@ -763,6 +776,12 @@ elif [ "$MACHINE" = "caraway" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("rocm/4.3.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" "rocm/4.5.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "cuda/11.4 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) if [ -z "$ARCH_FLAG" ]; then @@ -1318,13 +1337,13 @@ single_build_and_test() { # KOKKOS_OPTIONS and KOKKOS_CUDA_OPTIONS are exported and detected by kokkos' generate_makefile.sh during install of kokkos; we pass them to the reproducer script instructions echo " # Use generate_makefile line below to call cmake which generates makefile for this build:" &> call_generate_makefile.sh - echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples $extra_args" &>> call_generate_makefile.sh + echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args" &>> call_generate_makefile.sh chmod +x call_generate_makefile.sh # script command with generic path for faster copy/paste of reproducer into issues - echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples $extra_args" &> call_generate_makefile_genericpath.sh + echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args" &> call_generate_makefile_genericpath.sh - run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local make_par_lvl=12 if [[ "$MACHINE" = white* ]]; then From 9f38b83249b48bfe5c5a02c8ef4ea5dc2c46ba5f Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 9 Jun 2022 18:01:06 -0600 Subject: [PATCH 215/261] KokkosSparse Utils: changing namespace The sparse utils now live in the KokkosSparse_Utils.hpp header but they have not moved to the KokkosSparse namespace which is not very consistent. The changes made here fix that issue and deprecate the only struct that was defined in KokkosKernels but not in the Impl namespace --- perf_test/graph/KokkosGraph_mis_d2.cpp | 2 +- perf_test/sparse/KokkosSparse_block_pcg.cpp | 4 +- .../sparse/KokkosSparse_sptrsv_supernode.cpp | 3 +- src/graph/KokkosGraph_Distance2Color.hpp | 4 +- .../impl/KokkosGraph_Distance2MIS_impl.hpp | 2 +- src/sparse/KokkosSparse_Utils.hpp | 216 +++--------------- src/sparse/KokkosSparse_gauss_seidel.hpp | 18 +- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 6 +- .../impl/KokkosSparse_gauss_seidel_spec.hpp | 20 +- unit_test/graph/Test_Graph_graph_color.hpp | 2 +- .../Test_Graph_graph_color_distance2.hpp | 2 +- unit_test/sparse/Test_Sparse_Transpose.hpp | 8 +- .../sparse/Test_Sparse_block_gauss_seidel.hpp | 23 +- unit_test/sparse/Test_Sparse_spgemm.hpp | 2 +- 14 files changed, 86 insertions(+), 226 deletions(-) diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp index dfe7715a1d..df5e28b315 100644 --- a/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -258,7 +258,7 @@ void run_mis2(const MIS2Parameters& params) { std::cout << "I/O time: " << t.seconds() << " s\n"; t.reset(); // Symmetrize the matrix just in case - crsMat_t At_in = KokkosKernels::Impl::transpose_matrix(A_in); + crsMat_t At_in = KokkosSparse::Impl::transpose_matrix(A_in); crsMat_t A; KKH kkh; const default_scalar one = Kokkos::ArithTraits::one(); diff --git a/perf_test/sparse/KokkosSparse_block_pcg.cpp b/perf_test/sparse/KokkosSparse_block_pcg.cpp index 8e453b4d01..5664e943fb 100644 --- a/perf_test/sparse/KokkosSparse_block_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_block_pcg.cpp @@ -322,7 +322,7 @@ void run_experiment( // typedef typename lno_nnz_view_t::value_type lno_t; // typedef typename lno_view_t::value_type size_type; // typedef typename scalar_view_t::value_type scalar_t; - KokkosKernels::Impl::kk_create_blockcrs_formated_point_crsmatrix( + KokkosSparse::Impl::kk_create_blockcrs_formated_point_crsmatrix( block_size, crsmat.numRows(), crsmat.numCols(), crsmat.graph.row_map, crsmat.graph.entries, crsmat.values, out_r, out_c, pf_rm, pf_e, pf_v); @@ -349,7 +349,7 @@ void run_experiment( scalar_view_t bf_v; size_t but_r, but_c; - KokkosKernels::Impl::kk_create_blockcrs_from_blockcrs_formatted_point_crs( + KokkosSparse::Impl::kk_create_blockcrs_from_blockcrs_formatted_point_crs( block_size, out_r, out_c, pf_rm, pf_e, pf_v, but_r, but_c, bf_rm, bf_e, bf_v); diff --git a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp index ad8e1ba8b9..fcfc66b74e 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp @@ -63,6 +63,7 @@ using namespace KokkosKernels; using namespace KokkosKernels::Impl; using namespace KokkosKernels::Experimental; using namespace KokkosSparse; +using namespace KokkosSparse::Impl; using namespace KokkosSparse::Experimental; using namespace KokkosSparse::PerfTest::Experimental; @@ -154,7 +155,7 @@ int test_sptrsv_perf(std::vector tests, bool verbose, cols_view_t entries("colmap_view", nnzL); values_view_t values("values_view", nnzL); // transpose L - transpose_matrix(nrows, nrows, row_mapM, entriesM, valuesM, row_map, entries, values); diff --git a/src/graph/KokkosGraph_Distance2Color.hpp b/src/graph/KokkosGraph_Distance2Color.hpp index 211ad42f63..7bf19452b4 100644 --- a/src/graph/KokkosGraph_Distance2Color.hpp +++ b/src/graph/KokkosGraph_Distance2Color.hpp @@ -157,7 +157,7 @@ void bipartite_color_rows(KernelHandle *handle, // Compute the transpose col_map = TRowmap("Col map", num_columns + 1); col_entries = TEntries("Col entries", nnz); - KokkosKernels::Impl::transpose_graph( num_rows, num_columns, row_map, row_entries, col_map, col_entries); } @@ -235,7 +235,7 @@ void bipartite_color_columns(KernelHandle *handle, TRowmap col_map("Col map", num_columns + 1); TEntries col_entries( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Col entries"), nnz); - KokkosKernels::Impl::transpose_graph( num_rows, num_columns, row_map, row_entries, col_map, col_entries); // Get unmanaged views for both graph and its transpose diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index 041a2f861b..195d08dc0a 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -598,7 +598,7 @@ struct D2_MIS_FixedPriority { Kokkos::view_alloc(Kokkos::WithoutInitializing, "RowStatus"), numVerts); colStatus = status_view_t( Kokkos::view_alloc(Kokkos::WithoutInitializing, "ColStatus"), numVerts); - KokkosKernels::Impl::graph_min_max_degree( + KokkosSparse::Impl::graph_min_max_degree( rowmap, minDegree, maxDegree); // Compute row statuses Kokkos::parallel_for(range_pol(0, numVerts), diff --git a/src/sparse/KokkosSparse_Utils.hpp b/src/sparse/KokkosSparse_Utils.hpp index 323ae7846f..c84c928d05 100644 --- a/src/sparse/KokkosSparse_Utils.hpp +++ b/src/sparse/KokkosSparse_Utils.hpp @@ -57,9 +57,9 @@ #include #endif -namespace KokkosKernels { +namespace KokkosSparse { -enum SparseMatrixFormat { +enum SparseMatrixFormat { BlockCRS, BSR, CRS = BlockCRS, // convenience alias: for block_size=1 or no-blocks there is @@ -425,11 +425,11 @@ void transpose_matrix( // determine vector lanes per thread int thread_size = kk_get_suggested_vector_size( - num_rows, nnz, kk_get_exec_space_type()); + num_rows, nnz, KokkosKernels::Impl::kk_get_exec_space_type()); // determine threads per team int team_size = kk_get_suggested_team_size( - thread_size, kk_get_exec_space_type()); + thread_size, KokkosKernels::Impl::kk_get_exec_space_type()); TransposeFunctor_t tm(num_rows, num_cols, xadj, adj, vals, t_xadj, t_adj, t_vals, tmp_row_view, true, team_size); @@ -439,7 +439,7 @@ void transpose_matrix( team_size, thread_size), tm); - kk_exclusive_parallel_prefix_sum(num_cols + 1, + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(num_cols + 1, t_xadj); Kokkos::deep_copy(tmp_row_view, t_xadj); @@ -508,11 +508,11 @@ void transpose_graph( // determine vector lanes per thread int thread_size = kk_get_suggested_vector_size( - num_rows, nnz, kk_get_exec_space_type()); + num_rows, nnz, KokkosKernels::Impl::kk_get_exec_space_type()); // determine threads per team int team_size = kk_get_suggested_team_size( - thread_size, kk_get_exec_space_type()); + thread_size, KokkosKernels::Impl::kk_get_exec_space_type()); TransposeFunctor_t tm(num_rows, num_cols, xadj, adj, tmp1, t_xadj, t_adj, tmp2, tmp_row_view, false, team_size); @@ -522,7 +522,7 @@ void transpose_graph( team_size, thread_size), tm); - kk_exclusive_parallel_prefix_sum(num_cols + 1, + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(num_cols + 1, t_xadj); Kokkos::deep_copy(tmp_row_view, t_xadj); @@ -715,7 +715,7 @@ void kk_create_reverse_map( // kk_inclusive_parallel_prefix_sum(tmp_reverse_size + 1, tmp_color_xadj); - kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( tmp_reverse_size + 1, tmp_color_xadj); MyExecSpace().fence(); @@ -750,7 +750,7 @@ void kk_create_reverse_map( // kk_inclusive_parallel_prefix_sum(num_reverse_elements + 1, reverse_map_xadj); - kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( num_reverse_elements + 1, tmp_color_xadj); MyExecSpace().fence(); @@ -843,7 +843,7 @@ inline size_t kk_is_d1_coloring_valid( typename in_nnz_view_t::non_const_value_type num_rows, typename in_nnz_view_t::non_const_value_type /*num_cols*/, in_row_view_t xadj, in_nnz_view_t adj, in_color_view_t v_colors) { - ExecSpaceType my_exec_space = kk_get_exec_space_type(); + KokkosKernels::Impl::ExecSpaceType my_exec_space = KokkosKernels::Impl::kk_get_exec_space_type(); int vector_size = kk_get_suggested_vector_size(num_rows, adj.extent(0), my_exec_space); int suggested_team_size = @@ -926,160 +926,6 @@ void graph_min_max_degree(const rowmap_t &rowmap, ordinal_t &min_degree, max_degree = result.max_val; } -/* -template -struct IncidenceMatrix{ - - struct FillTag{}; - - typedef struct FillTag FillTag; - - typedef Kokkos::TeamPolicy team_fill_policy_t ; - typedef Kokkos::TeamPolicy > dynamic_team_fill_policy_t ; typedef -typename team_fill_policy_t::member_type team_fill_member_t ; - - typedef typename in_nnz_view_t::non_const_value_type nnz_lno_t; - typedef typename in_row_view_t::non_const_value_type size_type; - - - typename in_nnz_view_t::non_const_value_type num_rows; - in_row_view_t xadj; - in_nnz_view_t adj; - out_nnz_view_t t_adj; //allocated - typename in_row_view_t::non_const_type tmp_txadj; - nnz_lno_t team_work_size; - - IncidenceMatrix( - nnz_lno_t num_rows_, - in_row_view_t xadj_, - in_nnz_view_t adj_, - out_nnz_view_t t_adj_, - typename in_row_view_t::non_const_type tmp_txadj_, - nnz_lno_t team_row_work_size_): - num_rows(num_rows_), - xadj(xadj_), adj(adj_), - t_adj(t_adj_), - tmp_txadj(tmp_txadj_), team_work_size(team_row_work_size_) {} - - - KOKKOS_INLINE_FUNCTION - void operator()(const FillTag&, const team_fill_member_t & teamMember) const { - const nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size; - const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN(team_row_begin + -team_work_size, num_rows); - - - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,team_row_begin,team_row_end), -[&] (const nnz_lno_t& row_index) { const size_type col_begin = xadj[row_index]; - const size_type col_end = xadj[row_index + 1]; - const nnz_lno_t left_work = col_end - col_begin; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, left_work), - [&] (nnz_lno_t i) { - const size_type adjind = i + col_begin; - const nnz_lno_t colIndex = adj[adjind]; - if (row_index < colIndex){ - - const size_type pos = -Kokkos::atomic_fetch_add(&(tmp_txadj(colIndex)),1); t_adj(adjind) = adjind; - t_adj(pos) = adjind; - } - }); - //} - }); - } -}; -*/ -/** - * \brief function returns transpose of the given graph. - * \param num_rows: num rows in input graph - * \param num_cols: num cols in input graph - * \param xadj: row pointers of the input graph - * \param adj: column indices of the input graph - * \param t_xadj: output, the row indices of the output graph. MUST BE - * INITIALIZED WITH ZEROES. \param t_adj: output, column indices. No need for - * initializations. \param vector_size: suggested vector size, optional. if -1, - * kernel will decide. \param suggested_team_size: suggested team size, - * optional. if -1, kernel will decide. \param team_work_chunk_size: suggested - * work size of a team, optional. if -1, kernel will decide. \param - * use_dynamic_scheduling: whether to use dynamic scheduling. Default is true. - */ -/* -template -inline void kk_create_incidence_matrix( - typename in_nnz_view_t::non_const_value_type num_rows, - in_row_view_t xadj, - in_nnz_view_t adj, - out_nnz_view_t i_adj, //pre-allocated -- no need for initialize -- size is -same as adj int vector_size = -1, int suggested_team_size = -1, typename -in_nnz_view_t::non_const_value_type team_work_chunk_size = -1, bool -use_dynamic_scheduling = true - ){ - - - typedef typename in_row_view_t::non_const_type tmp_row_view_t; - //allocate some memory for work for row pointers - tmp_row_view_t tmp_row_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, -"tmp_row_view"), num_rows + 1); - - Kokkos::deep_copy(tmp_row_view, xadj); - - in_nnz_view_t tmp1; - out_nnz_view_t tmp2; - - //create the functor for tranpose. - typedef IncidenceMatrix < - in_row_view_t, in_nnz_view_t, in_nnz_view_t, - out_nnz_view_t, MyExecSpace> IncidenceMatrix_Functor_t; - - IncidenceMatrix_Functor_t tm ( num_rows, xadj, adj, - t_adj, tmp_row_view, - false, - team_work_chunk_size); - - - typedef typename IncidenceMatrix_Functor_t::team_fill_policy_t fill_tp_t; - typedef typename IncidenceMatrix_Functor_t::dynamic_team_fill_policy_t -d_fill_tp_t; - - typename in_row_view_t::non_const_value_type nnz = adj.extent(0); - - //set the vector size, if not suggested. - if (vector_size == -1) - vector_size = kk_get_suggested_vector_size(num_rows, nnz, -kk_get_exec_space_type()); - - //set the team size, if not suggested. - if (suggested_team_size == -1) - suggested_team_size = kk_get_suggested_team_size(vector_size, -kk_get_exec_space_type()); - - //set the chunk size, if not suggested. - if (team_work_chunk_size == -1) - team_work_chunk_size = suggested_team_size; - - - - if (use_dynamic_scheduling){ - Kokkos::parallel_for( fill_tp_t(num_rows / team_work_chunk_size + 1 , -suggested_team_size, vector_size), tm); - } - else { - Kokkos::parallel_for( d_fill_tp_t(num_rows / team_work_chunk_size + 1 , -suggested_team_size, vector_size), tm); - } - MyExecSpace().fence(); - -} -*/ - template void kk_get_lower_triangle_count_sequential(const lno_t nv, const size_type *in_xadj, @@ -1140,7 +986,7 @@ struct LowerTriangularMatrix { scalar_t *t_vals; const lno_t team_work_size; - const ExecSpaceType exec_space; + const KokkosKernels::Impl::ExecSpaceType exec_space; const bool is_lower; LowerTriangularMatrix(const lno_t num_rows_, const size_type *xadj_, @@ -1157,7 +1003,7 @@ struct LowerTriangularMatrix { t_adj(t_adj_), t_vals(out_vals_), team_work_size(team_row_work_size_), - exec_space(kk_get_exec_space_type()), + exec_space(KokkosKernels::Impl::kk_get_exec_space_type()), is_lower(is_lower_) {} KOKKOS_INLINE_FUNCTION @@ -1274,9 +1120,9 @@ void kk_get_lower_triangle_count_parallel( bool use_dynamic_scheduling = false, int chunksize = 4, bool is_lower = true) { const int vector_size = kk_get_suggested_vector_size( - nv, ne, kk_get_exec_space_type()); + nv, ne, KokkosKernels::Impl::kk_get_exec_space_type()); const int suggested_team_size = kk_get_suggested_team_size( - vector_size, kk_get_exec_space_type()); + vector_size, KokkosKernels::Impl::kk_get_exec_space_type()); const int team_work_chunk_size = suggested_team_size * chunksize; typedef LowerTriangularMatrix ltm_t; @@ -1439,9 +1285,9 @@ void kk_get_lower_triangle_fill_parallel( bool use_dynamic_scheduling = false, bool chunksize = 4, bool is_lower = true) { const int vector_size = kk_get_suggested_vector_size( - nv, ne, kk_get_exec_space_type()); + nv, ne, KokkosKernels::Impl::kk_get_exec_space_type()); const int suggested_team_size = kk_get_suggested_team_size( - vector_size, kk_get_exec_space_type()); + vector_size, KokkosKernels::Impl::kk_get_exec_space_type()); const int team_work_chunk_size = suggested_team_size * chunksize; typedef LowerTriangularMatrix @@ -1573,7 +1419,7 @@ crstmat_t kk_get_lower_triangle( nr, ne, rowmap, entries, new_row_map.data(), new_indices, use_dynamic_scheduling, chunksize); - kk_exclusive_parallel_prefix_sum(nr + 1, + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, new_row_map); exec_space().fence(); @@ -1630,7 +1476,7 @@ crstmat_t kk_get_lower_crs_matrix( nr, ne, rowmap, entries, new_row_map.data(), new_indices, use_dynamic_scheduling, chunksize); - kk_exclusive_parallel_prefix_sum(nr + 1, + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, new_row_map); exec_space().fence(); @@ -1683,7 +1529,7 @@ graph_t kk_get_lower_crs_graph(graph_t in_crs_matrix, kk_get_lower_triangle_count( nr, ne, rowmap, entries, new_row_map.data(), new_indices); - kk_exclusive_parallel_prefix_sum(nr + 1, + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, new_row_map); exec_space().fence(); @@ -1736,7 +1582,7 @@ void kk_get_lower_triangle(typename cols_view_t::non_const_value_type nr, nr, ne, rowmap, entries, out_rowmap.data(), new_indices.data(), use_dynamic_scheduling, chunksize, is_lower); - kk_exclusive_parallel_prefix_sum(nr + 1, + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, out_rowmap); exec_space().fence(); @@ -1844,7 +1690,7 @@ void kk_create_incidence_matrix_from_original_matrix( permutation.data(), use_dynamic_scheduling, chunksize, sort_decreasing_order); exec_space().fence(); - kk_exclusive_parallel_prefix_sum(nr + 1, + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, out_rowmap); // kk_print_1Dview(out_rowmap, false, 20); @@ -2069,21 +1915,21 @@ template struct MatrixTraits< KokkosSparse::CrsMatrix> { - static constexpr auto format = KokkosKernels::CRS; + static constexpr auto format = KokkosSparse::CRS; }; template struct MatrixTraits> { - static constexpr auto format = KokkosKernels::BlockCRS; + static constexpr auto format = KokkosSparse::BlockCRS; }; template struct MatrixTraits> { - static constexpr auto format = KokkosKernels::BSR; + static constexpr auto format = KokkosSparse::BSR; }; template @@ -2120,6 +1966,18 @@ struct MatrixConverter { }; } // namespace Impl +} // namespace KokkosSparse + +namespace KokkosKernels { + +enum [[deprecated]] SparseMatrixFormat { + BlockCRS, + BSR, + CRS = BlockCRS, // convenience alias: for block_size=1 or no-blocks there is + // no difference in value ordering (so the format tag becomes + // irrelevant) +}; + } // namespace KokkosKernels #endif diff --git a/src/sparse/KokkosSparse_gauss_seidel.hpp b/src/sparse/KokkosSparse_gauss_seidel.hpp index efe70dd1c5..1df960860b 100644 --- a/src/sparse/KokkosSparse_gauss_seidel.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel.hpp @@ -132,7 +132,7 @@ void block_gauss_seidel_symbolic( is_graph_symmetric); } -template void gauss_seidel_numeric(KernelHandle *handle, @@ -207,7 +207,7 @@ void gauss_seidel_numeric(KernelHandle *handle, is_graph_symmetric); } -template void gauss_seidel_numeric(KernelHandle *handle, @@ -286,7 +286,7 @@ void gauss_seidel_numeric(KernelHandle *handle, is_graph_symmetric); } -template void block_gauss_seidel_numeric( @@ -307,7 +307,7 @@ void block_gauss_seidel_numeric( values, is_graph_symmetric); } -template @@ -437,7 +437,7 @@ void symmetric_gauss_seidel_apply( update_y_vector, omega, numIter, true, true); } -template @@ -471,7 +471,7 @@ void symmetric_block_gauss_seidel_apply( handle, num_rows, num_cols, row_map, entries, values, x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, omega, numIter); } -template @@ -603,7 +603,7 @@ void forward_sweep_gauss_seidel_apply( update_y_vector, omega, numIter, true, false); } -template @@ -637,7 +637,7 @@ void forward_sweep_block_gauss_seidel_apply( handle, num_rows, num_cols, row_map, entries, values, x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, omega, numIter); } -template @@ -769,7 +769,7 @@ void backward_sweep_gauss_seidel_apply( update_y_vector, omega, numIter, false, true); } -template diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index abedbe80ed..137b75b3f7 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -63,7 +63,7 @@ namespace Impl { template + KokkosSparse::SparseMatrixFormat format = KokkosSparse::CRS> class PointGaussSeidel { public: typedef lno_row_view_t_ in_lno_row_view_t; @@ -137,7 +137,7 @@ class PointGaussSeidel { pool_memory_space; typedef - typename KokkosKernels::Impl::MatrixRowIndex + typename KokkosSparse::Impl::MatrixRowIndex RowIndex; private: @@ -1105,7 +1105,7 @@ class PointGaussSeidel { // std::cout << "level_2_mem:" << level_2_mem << std::endl; size_type num_large_rows = 0; - KokkosKernels::Impl::kk_reduce_numrows_larger_than_threshold< + KokkosSparse::Impl::kk_reduce_numrows_larger_than_threshold< row_lno_persistent_work_view_t, MyExecSpace>( brows, permuted_xadj, num_values_in_l1, num_large_rows); num_big_rows = KOKKOSKERNELS_MACRO_MIN( diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index 182d33a2e7..5af78f96c5 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -161,7 +161,7 @@ struct GAUSS_SEIDEL_SYMBOLIC { }; template < - class KernelHandle, KokkosKernels::SparseMatrixFormat format, + class KernelHandle, KokkosSparse::SparseMatrixFormat format, class a_size_view_t_, class a_lno_view_t, class a_scalar_view_t, bool tpl_spec_avail = gauss_seidel_numeric_tpl_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t>::value, @@ -180,7 +180,7 @@ struct GAUSS_SEIDEL_NUMERIC { a_scalar_view_t given_inverse_diagonal, bool is_graph_symmetric); }; -template struct GAUSS_SEIDEL_NUMERIC struct GAUSS_SEIDEL_APPLY, \ - KokkosKernels::BlockCRS, \ + KokkosSparse::BlockCRS, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -416,7 +416,7 @@ struct GAUSS_SEIDEL_APPLY, \ - KokkosKernels::BSR, \ + KokkosSparse::BSR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -435,7 +435,7 @@ struct GAUSS_SEIDEL_APPLY, \ - KokkosKernels::BlockCRS, \ + KokkosSparse::BlockCRS, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -456,7 +456,7 @@ struct GAUSS_SEIDEL_APPLY, \ - KokkosKernels::BSR, \ + KokkosSparse::BSR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -481,7 +481,7 @@ struct GAUSS_SEIDEL_APPLY, \ - KokkosKernels::BlockCRS, \ + KokkosSparse::BlockCRS, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -502,7 +502,7 @@ struct GAUSS_SEIDEL_APPLY, \ - KokkosKernels::BSR, \ + KokkosSparse::BSR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/unit_test/graph/Test_Graph_graph_color.hpp b/unit_test/graph/Test_Graph_graph_color.hpp index da86546862..b9e675ef98 100644 --- a/unit_test/graph/Test_Graph_graph_color.hpp +++ b/unit_test/graph/Test_Graph_graph_color.hpp @@ -168,7 +168,7 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, const lno_t num_rows_1 = input_mat.numRows(); const lno_t num_cols_1 = input_mat.numCols(); - lno_t num_conflict = KokkosKernels::Impl::kk_is_d1_coloring_valid< + lno_t num_conflict = KokkosSparse::Impl::kk_is_d1_coloring_valid< lno_view_t, lno_nnz_view_t, color_view_t, typename device::execution_space>( num_rows_1, num_cols_1, input_mat.graph.row_map, diff --git a/unit_test/graph/Test_Graph_graph_color_distance2.hpp b/unit_test/graph/Test_Graph_graph_color_distance2.hpp index 45444cd136..bca2855fea 100644 --- a/unit_test/graph/Test_Graph_graph_color_distance2.hpp +++ b/unit_test/graph/Test_Graph_graph_color_distance2.hpp @@ -278,7 +278,7 @@ void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, auto G = A.graph; rowmap_t t_rowmap("rowmap^T", numCols + 1); entries_t t_entries("entries^T", G.entries.extent(0)); - KokkosKernels::Impl::transpose_graph( numRows, numCols, G.row_map, G.entries, t_rowmap, t_entries); // TODO: remove me, shouldn't be needed even with UVM diff --git a/unit_test/sparse/Test_Sparse_Transpose.hpp b/unit_test/sparse/Test_Sparse_Transpose.hpp index 7431d0c485..f210873999 100644 --- a/unit_test/sparse/Test_Sparse_Transpose.hpp +++ b/unit_test/sparse/Test_Sparse_Transpose.hpp @@ -104,22 +104,22 @@ void testTranspose(int numRows, int numCols, bool doValues) { Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"), input_mat.values.extent(0)); if (doValues) { - KokkosKernels::Impl::transpose_matrix( numRows, numCols, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, t_rowmap, t_entries, t_values); - KokkosKernels::Impl::transpose_matrix( numCols, numRows, t_rowmap, t_entries, t_values, tt_rowmap, tt_entries, tt_values); } else { - KokkosKernels::Impl::transpose_graph( numRows, numCols, input_mat.graph.row_map, input_mat.graph.entries, t_rowmap, t_entries); - KokkosKernels::Impl::transpose_graph( numCols, numRows, t_rowmap, t_entries, tt_rowmap, tt_entries); } diff --git a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp index 0f4c9b0d67..3db10f71b1 100644 --- a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp @@ -71,6 +71,7 @@ using namespace KokkosKernels; using namespace KokkosKernels::Impl; using namespace KokkosKernels::Experimental; using namespace KokkosSparse; +using namespace KokkosSparse::Impl; using namespace KokkosSparse::Experimental; namespace Test { @@ -176,7 +177,7 @@ int run_block_gauss_seidel_1( } // namespace Test -template void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { @@ -212,7 +213,7 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, // this makes consecutive 5 rows to have same columns. // it will add scalar 0's for those entries that does not exists. // the result is still a point crs matrix. - KokkosKernels::Impl::kk_create_blockcrs_formated_point_crsmatrix( + KokkosSparse::Impl::kk_create_blockcrs_formated_point_crsmatrix( block_size, crsmat.numRows(), crsmat.numCols(), crsmat.graph.row_map, crsmat.graph.entries, crsmat.values, out_r, out_c, pf_rm, pf_e, pf_v); graph_t static_graph2(pf_e, pf_rm); @@ -263,7 +264,7 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, // device::execution_space::finalize(); } -template void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { @@ -300,7 +301,7 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, // this makes consecutive 5 rows to have same columns. // it will add scalar 0's for those entries that does not exists. // the result is still a point crs matrix. - KokkosKernels::Impl::kk_create_blockcrs_formated_point_crsmatrix( + KokkosSparse::Impl::kk_create_blockcrs_formated_point_crsmatrix( block_size, crsmat.numRows(), crsmat.numCols(), crsmat.graph.row_map, crsmat.graph.entries, crsmat.values, out_r, out_c, pf_rm, pf_e, pf_v); graph_t static_graph2(pf_e, pf_rm); @@ -373,7 +374,7 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, // device::execution_space::finalize(); } -template void test_block_gauss_seidel_empty() { using namespace Test; @@ -421,37 +422,37 @@ void test_block_gauss_seidel_empty() { TEST_F( \ TestCategory, \ sparse_blockcrs_gauss_seidel_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_rank1( \ + test_block_gauss_seidel_rank1( \ 500, 500 * 10, 70, 3); \ } \ TEST_F( \ TestCategory, \ sparse_blockcrs_gauss_seidel_rank2_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_rank2( \ + test_block_gauss_seidel_rank2( \ 500, 500 * 10, 70, 3); \ } \ TEST_F( \ TestCategory, \ sparse_blockcrs_gauss_seidel_empty_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_empty(); \ } \ TEST_F( \ TestCategory, \ sparse_bsr_gauss_seidel_rank1_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_rank1( \ + test_block_gauss_seidel_rank1( \ 500, 500 * 10, 70, 3); \ } \ TEST_F( \ TestCategory, \ sparse_bsr_gauss_seidel_rank2_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_rank2( \ + test_block_gauss_seidel_rank2( \ 500, 500 * 10, 70, 3); \ } \ TEST_F( \ TestCategory, \ sparse_bsr_gauss_seidel_empty_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_empty(); \ + test_block_gauss_seidel_empty(); \ } #include diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp index 35473046d8..f52306ef74 100644 --- a/unit_test/sparse/Test_Sparse_spgemm.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm.hpp @@ -403,7 +403,7 @@ void test_issue402() { lno_view_t Browmap("B = A^T rowmap", numRows + 1); lno_nnz_view_t Bentries("B = A^T entries", nnz); scalar_view_t Bvalues("B = A^T values", nnz); - KokkosKernels::Impl::transpose_matrix< + KokkosSparse::Impl::transpose_matrix< lno_view_t, lno_nnz_view_t, scalar_view_t, lno_view_t, lno_nnz_view_t, scalar_view_t, lno_view_t, typename device::execution_space>( numRows, numRows, Arowmap, Aentries, Avalues, Browmap, Bentries, Bvalues); From 86ea70283537a14972e18d4c7a9a9e5366c989e6 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 9 Jun 2022 17:49:50 -0600 Subject: [PATCH 216/261] KokkosSparse Utils: applying clang-format --- .../sparse/KokkosSparse_sptrsv_supernode.cpp | 8 +- src/graph/KokkosGraph_Distance2Color.hpp | 4 +- src/sparse/KokkosSparse_Utils.hpp | 74 +++++++++++-------- .../Test_Graph_graph_color_distance2.hpp | 2 +- unit_test/sparse/Test_Sparse_Transpose.hpp | 12 +-- .../sparse/Test_Sparse_block_gauss_seidel.hpp | 35 ++++----- 6 files changed, 75 insertions(+), 60 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp index fcfc66b74e..612b327d5f 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp @@ -155,10 +155,10 @@ int test_sptrsv_perf(std::vector tests, bool verbose, cols_view_t entries("colmap_view", nnzL); values_view_t values("values_view", nnzL); // transpose L - KokkosSparse::Impl::transpose_matrix(nrows, nrows, row_mapM, entriesM, - valuesM, row_map, entries, values); + KokkosSparse::Impl::transpose_matrix< + in_row_map_view_t, in_cols_view_t, in_values_view_t, row_map_view_t, + cols_view_t, values_view_t, row_map_view_t, host_execution_space>( + nrows, nrows, row_mapM, entriesM, valuesM, row_map, entries, values); // store L in CSC host_graph_t static_graph(entries, row_map); diff --git a/src/graph/KokkosGraph_Distance2Color.hpp b/src/graph/KokkosGraph_Distance2Color.hpp index 7bf19452b4..dbfd1b40e9 100644 --- a/src/graph/KokkosGraph_Distance2Color.hpp +++ b/src/graph/KokkosGraph_Distance2Color.hpp @@ -158,7 +158,7 @@ void bipartite_color_rows(KernelHandle *handle, col_map = TRowmap("Col map", num_columns + 1); col_entries = TEntries("Col entries", nnz); KokkosSparse::Impl::transpose_graph( + TRowmap, execution_space>( num_rows, num_columns, row_map, row_entries, col_map, col_entries); } InternalRowmap rowmap_internal(row_map.data(), row_map.extent(0)); @@ -236,7 +236,7 @@ void bipartite_color_columns(KernelHandle *handle, TEntries col_entries( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Col entries"), nnz); KokkosSparse::Impl::transpose_graph( + TRowmap, execution_space>( num_rows, num_columns, row_map, row_entries, col_map, col_entries); // Get unmanaged views for both graph and its transpose InternalRowmap colmap_internal(col_map.data(), col_map.extent(0)); diff --git a/src/sparse/KokkosSparse_Utils.hpp b/src/sparse/KokkosSparse_Utils.hpp index c84c928d05..0ad7102dd5 100644 --- a/src/sparse/KokkosSparse_Utils.hpp +++ b/src/sparse/KokkosSparse_Utils.hpp @@ -59,7 +59,7 @@ namespace KokkosSparse { -enum SparseMatrixFormat { +enum SparseMatrixFormat { BlockCRS, BSR, CRS = BlockCRS, // convenience alias: for block_size=1 or no-blocks there is @@ -425,7 +425,8 @@ void transpose_matrix( // determine vector lanes per thread int thread_size = kk_get_suggested_vector_size( - num_rows, nnz, KokkosKernels::Impl::kk_get_exec_space_type()); + num_rows, nnz, + KokkosKernels::Impl::kk_get_exec_space_type()); // determine threads per team int team_size = kk_get_suggested_team_size( @@ -439,8 +440,9 @@ void transpose_matrix( team_size, thread_size), tm); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(num_cols + 1, - t_xadj); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + num_cols + 1, t_xadj); Kokkos::deep_copy(tmp_row_view, t_xadj); @@ -508,7 +510,8 @@ void transpose_graph( // determine vector lanes per thread int thread_size = kk_get_suggested_vector_size( - num_rows, nnz, KokkosKernels::Impl::kk_get_exec_space_type()); + num_rows, nnz, + KokkosKernels::Impl::kk_get_exec_space_type()); // determine threads per team int team_size = kk_get_suggested_team_size( @@ -522,8 +525,9 @@ void transpose_graph( team_size, thread_size), tm); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(num_cols + 1, - t_xadj); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + num_cols + 1, t_xadj); Kokkos::deep_copy(tmp_row_view, t_xadj); @@ -715,7 +719,8 @@ void kk_create_reverse_map( // kk_inclusive_parallel_prefix_sum(tmp_reverse_size + 1, tmp_color_xadj); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( tmp_reverse_size + 1, tmp_color_xadj); MyExecSpace().fence(); @@ -750,7 +755,8 @@ void kk_create_reverse_map( // kk_inclusive_parallel_prefix_sum(num_reverse_elements + 1, reverse_map_xadj); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( num_reverse_elements + 1, tmp_color_xadj); MyExecSpace().fence(); @@ -843,7 +849,8 @@ inline size_t kk_is_d1_coloring_valid( typename in_nnz_view_t::non_const_value_type num_rows, typename in_nnz_view_t::non_const_value_type /*num_cols*/, in_row_view_t xadj, in_nnz_view_t adj, in_color_view_t v_colors) { - KokkosKernels::Impl::ExecSpaceType my_exec_space = KokkosKernels::Impl::kk_get_exec_space_type(); + KokkosKernels::Impl::ExecSpaceType my_exec_space = + KokkosKernels::Impl::kk_get_exec_space_type(); int vector_size = kk_get_suggested_vector_size(num_rows, adj.extent(0), my_exec_space); int suggested_team_size = @@ -1003,7 +1010,8 @@ struct LowerTriangularMatrix { t_adj(t_adj_), t_vals(out_vals_), team_work_size(team_row_work_size_), - exec_space(KokkosKernels::Impl::kk_get_exec_space_type()), + exec_space( + KokkosKernels::Impl::kk_get_exec_space_type()), is_lower(is_lower_) {} KOKKOS_INLINE_FUNCTION @@ -1122,7 +1130,8 @@ void kk_get_lower_triangle_count_parallel( const int vector_size = kk_get_suggested_vector_size( nv, ne, KokkosKernels::Impl::kk_get_exec_space_type()); const int suggested_team_size = kk_get_suggested_team_size( - vector_size, KokkosKernels::Impl::kk_get_exec_space_type()); + vector_size, + KokkosKernels::Impl::kk_get_exec_space_type()); const int team_work_chunk_size = suggested_team_size * chunksize; typedef LowerTriangularMatrix ltm_t; @@ -1287,7 +1296,8 @@ void kk_get_lower_triangle_fill_parallel( const int vector_size = kk_get_suggested_vector_size( nv, ne, KokkosKernels::Impl::kk_get_exec_space_type()); const int suggested_team_size = kk_get_suggested_team_size( - vector_size, KokkosKernels::Impl::kk_get_exec_space_type()); + vector_size, + KokkosKernels::Impl::kk_get_exec_space_type()); const int team_work_chunk_size = suggested_team_size * chunksize; typedef LowerTriangularMatrix @@ -1419,8 +1429,9 @@ crstmat_t kk_get_lower_triangle( nr, ne, rowmap, entries, new_row_map.data(), new_indices, use_dynamic_scheduling, chunksize); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, - new_row_map); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + nr + 1, new_row_map); exec_space().fence(); auto ll_size = Kokkos::subview(new_row_map, nr); @@ -1476,8 +1487,9 @@ crstmat_t kk_get_lower_crs_matrix( nr, ne, rowmap, entries, new_row_map.data(), new_indices, use_dynamic_scheduling, chunksize); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, - new_row_map); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + nr + 1, new_row_map); exec_space().fence(); auto ll_size = Kokkos::subview(new_row_map, nr); @@ -1529,8 +1541,9 @@ graph_t kk_get_lower_crs_graph(graph_t in_crs_matrix, kk_get_lower_triangle_count( nr, ne, rowmap, entries, new_row_map.data(), new_indices); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, - new_row_map); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + nr + 1, new_row_map); exec_space().fence(); auto ll_size = Kokkos::subview(new_row_map, nr); @@ -1582,8 +1595,9 @@ void kk_get_lower_triangle(typename cols_view_t::non_const_value_type nr, nr, ne, rowmap, entries, out_rowmap.data(), new_indices.data(), use_dynamic_scheduling, chunksize, is_lower); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, - out_rowmap); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, + out_rowmap); exec_space().fence(); auto ll_size = Kokkos::subview(out_rowmap, nr); @@ -1690,8 +1704,9 @@ void kk_create_incidence_matrix_from_original_matrix( permutation.data(), use_dynamic_scheduling, chunksize, sort_decreasing_order); exec_space().fence(); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, - out_rowmap); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, + out_rowmap); // kk_print_1Dview(out_rowmap, false, 20); @@ -1966,16 +1981,15 @@ struct MatrixConverter { }; } // namespace Impl -} // namespace KokkosSparse +} // namespace KokkosSparse namespace KokkosKernels { -enum [[deprecated]] SparseMatrixFormat { - BlockCRS, - BSR, - CRS = BlockCRS, // convenience alias: for block_size=1 or no-blocks there is - // no difference in value ordering (so the format tag becomes - // irrelevant) +enum [[deprecated]] SparseMatrixFormat{ + BlockCRS, BSR, + CRS = BlockCRS, // convenience alias: for block_size=1 or no-blocks there + // is no difference in value ordering (so the format tag + // becomes irrelevant) }; } // namespace KokkosKernels diff --git a/unit_test/graph/Test_Graph_graph_color_distance2.hpp b/unit_test/graph/Test_Graph_graph_color_distance2.hpp index bca2855fea..c78e8c2f5f 100644 --- a/unit_test/graph/Test_Graph_graph_color_distance2.hpp +++ b/unit_test/graph/Test_Graph_graph_color_distance2.hpp @@ -279,7 +279,7 @@ void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, rowmap_t t_rowmap("rowmap^T", numCols + 1); entries_t t_entries("entries^T", G.entries.extent(0)); KokkosSparse::Impl::transpose_graph( + entries_t, rowmap_t, execution_space>( numRows, numCols, G.row_map, G.entries, t_rowmap, t_entries); // TODO: remove me, shouldn't be needed even with UVM execution_space().fence(); diff --git a/unit_test/sparse/Test_Sparse_Transpose.hpp b/unit_test/sparse/Test_Sparse_Transpose.hpp index f210873999..530614eace 100644 --- a/unit_test/sparse/Test_Sparse_Transpose.hpp +++ b/unit_test/sparse/Test_Sparse_Transpose.hpp @@ -105,22 +105,22 @@ void testTranspose(int numRows, int numCols, bool doValues) { input_mat.values.extent(0)); if (doValues) { KokkosSparse::Impl::transpose_matrix( + rowmap_t, entries_t, values_t, + rowmap_t, exec_space>( numRows, numCols, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, t_rowmap, t_entries, t_values); KokkosSparse::Impl::transpose_matrix( + rowmap_t, entries_t, values_t, + rowmap_t, exec_space>( numCols, numRows, t_rowmap, t_entries, t_values, tt_rowmap, tt_entries, tt_values); } else { KokkosSparse::Impl::transpose_graph( + entries_t, rowmap_t, exec_space>( numRows, numCols, input_mat.graph.row_map, input_mat.graph.entries, t_rowmap, t_entries); KokkosSparse::Impl::transpose_graph( + entries_t, rowmap_t, exec_space>( numCols, numRows, t_rowmap, t_entries, tt_rowmap, tt_entries); } // Sort both the transpose-transpose, and the original matrix (to compare diff --git a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp index 3db10f71b1..9092e78d79 100644 --- a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp @@ -177,8 +177,8 @@ int run_block_gauss_seidel_1( } // namespace Test -template +template void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using namespace Test; @@ -264,8 +264,8 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, // device::execution_space::finalize(); } -template +template void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using namespace Test; @@ -374,8 +374,8 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, // device::execution_space::finalize(); } -template +template void test_block_gauss_seidel_empty() { using namespace Test; typedef @@ -422,37 +422,38 @@ void test_block_gauss_seidel_empty() { TEST_F( \ TestCategory, \ sparse_blockcrs_gauss_seidel_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_rank1( \ - 500, 500 * 10, 70, 3); \ + test_block_gauss_seidel_rank1(500, 500 * 10, 70, 3); \ } \ TEST_F( \ TestCategory, \ sparse_blockcrs_gauss_seidel_rank2_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_rank2( \ - 500, 500 * 10, 70, 3); \ + test_block_gauss_seidel_rank2(500, 500 * 10, 70, 3); \ } \ TEST_F( \ TestCategory, \ sparse_blockcrs_gauss_seidel_empty_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_empty(); \ + test_block_gauss_seidel_empty(); \ } \ TEST_F( \ TestCategory, \ sparse_bsr_gauss_seidel_rank1_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_rank1( \ - 500, 500 * 10, 70, 3); \ + test_block_gauss_seidel_rank1(500, 500 * 10, 70, 3); \ } \ TEST_F( \ TestCategory, \ sparse_bsr_gauss_seidel_rank2_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_rank2( \ - 500, 500 * 10, 70, 3); \ + test_block_gauss_seidel_rank2(500, 500 * 10, 70, 3); \ } \ TEST_F( \ TestCategory, \ sparse_bsr_gauss_seidel_empty_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_empty(); \ + test_block_gauss_seidel_empty(); \ } #include From 35e2f621c2eb4b7bc858162db4532457ab292aef Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 13 Jul 2022 18:11:37 -0600 Subject: [PATCH 217/261] Sparse Utils: fixing some spelling and alias namespaces Using namespace alias is preferable to importing the content of namespaces since it avoids potential clashes. Of course using fully specified function names is also fine. Signed-off-by: Luc Berger-Vergiat --- perf_test/sparse/KokkosSparse_block_pcg.cpp | 2 +- .../sparse/KokkosSparse_sptrsv_supernode.cpp | 33 ++++----- src/sparse/KokkosSparse_Utils.hpp | 6 +- .../sparse/Test_Sparse_block_gauss_seidel.hpp | 68 ++++++++----------- 4 files changed, 46 insertions(+), 63 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_block_pcg.cpp b/perf_test/sparse/KokkosSparse_block_pcg.cpp index 5664e943fb..a1758c1ae7 100644 --- a/perf_test/sparse/KokkosSparse_block_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_block_pcg.cpp @@ -322,7 +322,7 @@ void run_experiment( // typedef typename lno_nnz_view_t::value_type lno_t; // typedef typename lno_view_t::value_type size_type; // typedef typename scalar_view_t::value_type scalar_t; - KokkosSparse::Impl::kk_create_blockcrs_formated_point_crsmatrix( + KokkosSparse::Impl::kk_create_blockcrs_formatted_point_crsmatrix( block_size, crsmat.numRows(), crsmat.numCols(), crsmat.graph.row_map, crsmat.graph.entries, crsmat.values, out_r, out_c, pf_rm, pf_e, pf_v); diff --git a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp index 612b327d5f..b7eb39d68e 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp @@ -59,13 +59,8 @@ #include "KokkosSparse_sptrsv_aux.hpp" -using namespace KokkosKernels; -using namespace KokkosKernels::Impl; -using namespace KokkosKernels::Experimental; -using namespace KokkosSparse; -using namespace KokkosSparse::Impl; -using namespace KokkosSparse::Experimental; -using namespace KokkosSparse::PerfTest::Experimental; +namespace KSExp = KokkosSparse::Experimental; +namespace KSPTE = KokkosSparse::PerfTest::Experimental; enum { CUSPARSE, @@ -213,23 +208,23 @@ int test_sptrsv_perf(std::vector tests, bool verbose, if (test == SUPERNODAL_NAIVE) { std::cout << " > create handle for SUPERNODAL_NAIVE" << std::endl << std::endl; - khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_NAIVE, nrows, + khL.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_NAIVE, nrows, true); - khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_NAIVE, nrows, + khU.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_NAIVE, nrows, true); } else if (test == SUPERNODAL_DAG) { std::cout << " > create handle for SUPERNODAL_DAG" << std::endl << std::endl; - khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, + khL.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, true); - khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, + khU.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, true); } else if (test == SUPERNODAL_SPMV_DAG) { std::cout << " > create handle for SUPERNODAL_SPMV_DAG" << std::endl << std::endl; - khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, + khL.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, nrows, true); - khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, + khU.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, nrows, true); } // verbose (optional, default is false) @@ -255,13 +250,13 @@ int test_sptrsv_perf(std::vector tests, bool verbose, // graph/dag) khU.get_sptrsv_handle()->set_column_major( !khL.get_sptrsv_handle()->is_column_major()); - sptrsv_supernodal_symbolic(nsuper, supercols.data(), etree, L.graph, - &khL, L.graph, &khU); + KSExp::sptrsv_supernodal_symbolic(nsuper, supercols.data(), etree, L.graph, + &khL, L.graph, &khU); // ============================================== // do numeric compute (copy numerical values from SuperLU data // structure to our sptrsv data structure) - sptrsv_compute(&khL, L); + KSExp::sptrsv_compute(&khL, L); // ============================================== // Preaparing for the first solve @@ -285,7 +280,7 @@ int test_sptrsv_perf(std::vector tests, bool verbose, // ============================================== // do L solve timer.reset(); - sptrsv_solve(&khL, sol, rhs); + KSExp::sptrsv_solve(&khL, sol, rhs); Kokkos::fence(); std::cout << " > Lower-TRI: " << std::endl; std::cout << " Solve Time : " << timer.seconds() << std::endl; @@ -297,7 +292,7 @@ int test_sptrsv_perf(std::vector tests, bool verbose, // Error Check ** on host ** Kokkos::fence(); std::cout << std::endl; - if (!check_errors(tol, A, rhs_host, sol_host)) { + if (!KSPTE::check_errors(tol, A, rhs_host, sol_host)) { num_failed++; } @@ -309,7 +304,7 @@ int test_sptrsv_perf(std::vector tests, bool verbose, Kokkos::fence(); for (int i = 0; i < loop; i++) { timer.reset(); - sptrsv_solve(&khL, sol, rhs); + KSExp::sptrsv_solve(&khL, sol, rhs); Kokkos::fence(); double time = timer.seconds(); ave_time += time; diff --git a/src/sparse/KokkosSparse_Utils.hpp b/src/sparse/KokkosSparse_Utils.hpp index 0ad7102dd5..db656c959b 100644 --- a/src/sparse/KokkosSparse_Utils.hpp +++ b/src/sparse/KokkosSparse_Utils.hpp @@ -72,7 +72,7 @@ namespace Impl { template -void kk_create_blockcrs_formated_point_crsmatrix( +void kk_create_blockcrs_formatted_point_crsmatrix( int block_size, size_t num_rows, size_t num_cols, in_row_view_t in_xadj, in_nnz_view_t in_adj, in_val_view_t in_vals, @@ -1958,7 +1958,7 @@ struct MatrixConverter { KokkosSparse::CrsMatrix, typename blockCrsMat_t = KokkosSparse::Experimental::BlockCrsMatrix< scalar_t, lno_t, device, void, size_type>> - static blockCrsMat_t from_blockcrs_formated_point_crsmatrix( + static blockCrsMat_t from_blockcrs_formatted_point_crsmatrix( const KokkosSparse::CrsMatrix &mtx, lno_t block_size) { @@ -1972,7 +1972,7 @@ struct MatrixConverter { typename device, typename bsrMtx_t = KokkosSparse::Experimental::BsrMatrix< scalar_t, lno_t, device, void, size_type>> - static bsrMtx_t from_blockcrs_formated_point_crsmatrix( + static bsrMtx_t from_blockcrs_formatted_point_crsmatrix( const KokkosSparse::CrsMatrix &mtx, lno_t block_size) { diff --git a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp index 9092e78d79..b0c57ccf7e 100644 --- a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp @@ -59,20 +59,10 @@ #include #include "KokkosSparse_gauss_seidel.hpp" -// #ifndef kokkos_complex_double -// #define kokkos_complex_double Kokkos::complex -// #define kokkos_complex_float Kokkos::complex -// #endif +using kokkos_complex_double = Kokkos::complex; +using kokkos_complex_float = Kokkos::complex; -typedef Kokkos::complex kokkos_complex_double; -typedef Kokkos::complex kokkos_complex_float; - -using namespace KokkosKernels; -using namespace KokkosKernels::Impl; -using namespace KokkosKernels::Experimental; -using namespace KokkosSparse; -using namespace KokkosSparse::Impl; -using namespace KokkosSparse::Experimental; +namespace KSExp = KokkosSparse::Experimental; namespace Test { @@ -92,7 +82,7 @@ struct GSTestParams { // Note: GS_DEFAULT is same as GS_TEAM and - for blocks - as GS_PERMUTED // Note: GS_TWOSTAGE and GS_CLUSTER are not supported for blocks - std::vector gs_algorithms = {GS_DEFAULT}; + std::vector gs_algorithms = {KokkosSparse::GS_DEFAULT}; std::vector shmem_sizes = { 32128, 2008 // make the shmem small on gpus so that it will test 2 level @@ -121,12 +111,11 @@ int run_block_gauss_seidel_1( typedef typename lno_nnz_view_t::value_type lno_t; typedef typename scalar_view_t::value_type scalar_t; - constexpr auto format = MatrixTraits::format; + constexpr auto format = KokkosSparse::Impl::MatrixTraits::format; - typedef KokkosKernelsHandle< + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, typename mtx_t::execution_space, - typename mtx_t::memory_space, typename mtx_t::memory_space> - KernelHandle; + typename mtx_t::memory_space, typename mtx_t::memory_space>; KernelHandle kh; kh.set_team_work_size(16); kh.set_shmem_size(shmem_size); @@ -138,33 +127,33 @@ int run_block_gauss_seidel_1( const int apply_count = 100; if (!skip_symbolic) { - block_gauss_seidel_symbolic(&kh, num_rows_1, num_cols_1, block_size, + KSExp::block_gauss_seidel_symbolic(&kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map, input_mat.graph.entries, is_symmetric_graph); } if (!skip_numeric) { - block_gauss_seidel_numeric( + KSExp::block_gauss_seidel_numeric( &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, is_symmetric_graph); } switch (apply_type) { case Test::forward_sweep: - forward_sweep_block_gauss_seidel_apply( + KSExp::forward_sweep_block_gauss_seidel_apply( &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); break; case Test::backward_sweep: - backward_sweep_block_gauss_seidel_apply( + KSExp::backward_sweep_block_gauss_seidel_apply( &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); break; case Test::symmetric: default: - symmetric_block_gauss_seidel_apply( + KSExp::symmetric_block_gauss_seidel_apply( &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); @@ -183,9 +172,9 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using namespace Test; srand(245); - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; + using crsMat_t = + typename KokkosSparse::CrsMatrix; + using MatrixConverter = KokkosSparse::Impl::MatrixConverter; typedef typename device::execution_space exec_space; typedef typename crsMat_t::StaticCrsGraphType graph_t; @@ -213,7 +202,7 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, // this makes consecutive 5 rows to have same columns. // it will add scalar 0's for those entries that does not exists. // the result is still a point crs matrix. - KokkosSparse::Impl::kk_create_blockcrs_formated_point_crsmatrix( + KokkosSparse::Impl::kk_create_blockcrs_formatted_point_crsmatrix( block_size, crsmat.numRows(), crsmat.numCols(), crsmat.graph.row_map, crsmat.graph.entries, crsmat.values, out_r, out_c, pf_rm, pf_e, pf_v); graph_t static_graph2(pf_e, pf_rm); @@ -221,7 +210,7 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, // this converts the previous generated matrix to block matrix. auto input_mat = - MatrixConverter::from_blockcrs_formated_point_crsmatrix( + MatrixConverter::from_blockcrs_formatted_point_crsmatrix( crsmat2, block_size); lno_t nv = ((crsmat2.numRows() + block_size - 1) / block_size) * block_size; @@ -270,9 +259,9 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using namespace Test; srand(245); - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; + using crsMat_t = + typename KokkosSparse::CrsMatrix; + using MatrixConverter = KokkosSparse::Impl::MatrixConverter; typedef typename device::execution_space exec_space; typedef typename crsMat_t::StaticCrsGraphType graph_t; @@ -301,14 +290,14 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, // this makes consecutive 5 rows to have same columns. // it will add scalar 0's for those entries that does not exists. // the result is still a point crs matrix. - KokkosSparse::Impl::kk_create_blockcrs_formated_point_crsmatrix( + KokkosSparse::Impl::kk_create_blockcrs_formatted_point_crsmatrix( block_size, crsmat.numRows(), crsmat.numCols(), crsmat.graph.row_map, crsmat.graph.entries, crsmat.values, out_r, out_c, pf_rm, pf_e, pf_v); graph_t static_graph2(pf_e, pf_rm); crsMat_t crsmat2("CrsMatrix2", out_c, pf_v, static_graph2); auto input_mat = - MatrixConverter::from_blockcrs_formated_point_crsmatrix( + MatrixConverter::from_blockcrs_formatted_point_crsmatrix( crsmat2, block_size); lno_t nv = ((crsmat2.numRows() + block_size - 1) / block_size) * block_size; @@ -385,10 +374,9 @@ void test_block_gauss_seidel_empty() { typedef typename graph_t::row_map_type::non_const_type row_map_type; typedef typename graph_t::entries_type::non_const_type entries_type; typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef KokkosKernelsHandle< + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space> - KernelHandle; + typename device::memory_space, typename device::memory_space>; // The rowmap of a zero-row matrix can be length 0 or 1, so Gauss-Seidel // should work with both (the setup and apply are essentially no-ops but they // shouldn't crash or throw exceptions) For this test, create size-0 and @@ -396,7 +384,7 @@ void test_block_gauss_seidel_empty() { // which can trigger different bugs. for (const int rowmapLen : {0, 1, 5}) { KernelHandle kh; - kh.create_gs_handle(GS_DEFAULT); + kh.create_gs_handle(KokkosSparse::GS_DEFAULT); const auto num_rows = KOKKOSKERNELS_MACRO_MAX(0, rowmapLen - 1); const lno_t block_size = 1; // irrelevant (no values here) // initialized to 0 @@ -404,14 +392,14 @@ void test_block_gauss_seidel_empty() { entries_type entries("Entries", 0); scalar_view_t values("Values", 0); // also, make sure graph symmetrization doesn't crash on zero rows - block_gauss_seidel_symbolic(&kh, num_rows, num_rows, block_size, rowmap, + KSExp::block_gauss_seidel_symbolic(&kh, num_rows, num_rows, block_size, rowmap, entries, false); - block_gauss_seidel_numeric(&kh, num_rows, num_rows, block_size, + KSExp::block_gauss_seidel_numeric(&kh, num_rows, num_rows, block_size, rowmap, entries, values, false); scalar_view_t x("X", num_rows); scalar_view_t y("Y", num_rows); scalar_t omega(0.9); - symmetric_block_gauss_seidel_apply( + KSExp::symmetric_block_gauss_seidel_apply( &kh, num_rows, num_rows, block_size, rowmap, entries, values, x, y, false, true, omega, 3); kh.destroy_gs_handle(); From ee5360454d39933419fbf76262cefdb83b19674f Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Wed, 13 Jul 2022 18:44:55 -0700 Subject: [PATCH 218/261] Reformat example/fenl files changed in #1382 --- example/fenl/TestFixture.hpp | 120 +-- example/fenl/fenl_functors.hpp | 1406 ++++++++++++++++---------------- 2 files changed, 751 insertions(+), 775 deletions(-) diff --git a/example/fenl/TestFixture.hpp b/example/fenl/TestFixture.hpp index 165265b881..7c09752433 100644 --- a/example/fenl/TestFixture.hpp +++ b/example/fenl/TestFixture.hpp @@ -56,102 +56,102 @@ namespace Kokkos { namespace Example { -template< class Device > -struct FixtureVerifyElemNodeCoord -{ - typedef Device execution_space ; +template +struct FixtureVerifyElemNodeCoord { + typedef Device execution_space; - typedef struct { size_t success , error ; } value_type ; + typedef struct { + size_t success, error; + } value_type; - typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ; + typedef Kokkos::Example::BoxElemFixture< + Device, Kokkos::Example::BoxElemPart::ElemLinear> + FixtureType; - FixtureType m_fixture ; + FixtureType m_fixture; KOKKOS_INLINE_FUNCTION - void init( value_type & update ) const { update.success = update.error = 0 ; } + void init(value_type& update) const { update.success = update.error = 0; } KOKKOS_INLINE_FUNCTION - void join( volatile value_type & update , - volatile const value_type & input ) const - { - update.success += input.success ; - update.error += input.error ; - } - + void join(volatile value_type& update, + volatile const value_type& input) const { + update.success += input.success; + update.error += input.error; + } KOKKOS_INLINE_FUNCTION - void operator()( size_t ielem , value_type & update ) const - { - unsigned node_coord[ FixtureType::ElemNode ][3] ; - - for ( unsigned i = 0 ; i < FixtureType::ElemNode ; ++i ) { - const unsigned node_id = m_fixture.elem_node(ielem,i); - node_coord[i][0] = m_fixture.node_grid(node_id,0); - node_coord[i][1] = m_fixture.node_grid(node_id,1); - node_coord[i][2] = m_fixture.node_grid(node_id,2); + void operator()(size_t ielem, value_type& update) const { + unsigned node_coord[FixtureType::ElemNode][3]; + + for (unsigned i = 0; i < FixtureType::ElemNode; ++i) { + const unsigned node_id = m_fixture.elem_node(ielem, i); + node_coord[i][0] = m_fixture.node_grid(node_id, 0); + node_coord[i][1] = m_fixture.node_grid(node_id, 1); + node_coord[i][2] = m_fixture.node_grid(node_id, 2); } - int error = 0 ; - for ( unsigned i = 1 ; i < FixtureType::ElemNode ; ++i ) { - if ( node_coord[0][0] + m_fixture.elem_node_local(i,0) != node_coord[i][0] || - node_coord[0][1] + m_fixture.elem_node_local(i,1) != node_coord[i][1] || - node_coord[0][2] + m_fixture.elem_node_local(i,2) != node_coord[i][2] ) { - error = 1 ; + int error = 0; + for (unsigned i = 1; i < FixtureType::ElemNode; ++i) { + if (node_coord[0][0] + m_fixture.elem_node_local(i, 0) != + node_coord[i][0] || + node_coord[0][1] + m_fixture.elem_node_local(i, 1) != + node_coord[i][1] || + node_coord[0][2] + m_fixture.elem_node_local(i, 2) != + node_coord[i][2]) { + error = 1; } } - if ( error ) { - ++update.error ; - } - else { - ++update.success ; + if (error) { + ++update.error; + } else { + ++update.success; } } - FixtureVerifyElemNodeCoord( const FixtureType & f ) : m_fixture(f) {} + FixtureVerifyElemNodeCoord(const FixtureType& f) : m_fixture(f) {} }; +template +void test_fixture() { + typedef Kokkos::Example::BoxElemFixture< + Device, Kokkos::Example::BoxElemPart::ElemLinear> + FixtureType; -template< class Device > -void test_fixture() -{ - typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ; - - const Kokkos::Example::BoxElemPart::Decompose - decompose = Kokkos::Example::BoxElemPart:: DecomposeElem ; // DecomposeElem | DecomposeNode ; - - const unsigned global_size = 256 ; - const unsigned global_nx = 400 ; - const unsigned global_ny = 400 ; - const unsigned global_nz = 400 ; + const Kokkos::Example::BoxElemPart::Decompose decompose = + Kokkos::Example::BoxElemPart::DecomposeElem; // DecomposeElem | + // DecomposeNode ; - for ( unsigned my_rank = 0 ; my_rank < global_size ; ++my_rank ) { + const unsigned global_size = 256; + const unsigned global_nx = 400; + const unsigned global_ny = 400; + const unsigned global_nz = 400; - const FixtureType fixture( decompose , global_size , my_rank , global_nx , global_ny , global_nz ); + for (unsigned my_rank = 0; my_rank < global_size; ++my_rank) { + const FixtureType fixture(decompose, global_size, my_rank, global_nx, + global_ny, global_nz); // Verify grid coordinates of element's nodes - - typename FixtureVerifyElemNodeCoord::value_type result = { 0 , 0 }; - Kokkos::parallel_reduce( fixture.elem_node().extent(0) , FixtureVerifyElemNodeCoord( fixture ) , result ); + typename FixtureVerifyElemNodeCoord::value_type result = {0, 0}; - if ( result.error ) { + Kokkos::parallel_reduce(fixture.elem_node().extent(0), + FixtureVerifyElemNodeCoord(fixture), + result); + + if (result.error) { std::cout << "P[" << my_rank << ":" << global_size << "] Fixture elem_node_coord" << " success(" << result.success << ")" - << " error(" << result.error << ")" - << std::endl ; + << " error(" << result.error << ")" << std::endl; } // Check send/recv alignment - - } } - } /* namespace Example */ } /* namespace Kokkos */ #endif /* #ifndef KOKKOS_EXAMPLE_TESTFIXTURE_HPP */ - diff --git a/example/fenl/fenl_functors.hpp b/example/fenl/fenl_functors.hpp index 01a4e989da..5706497db2 100644 --- a/example/fenl/fenl_functors.hpp +++ b/example/fenl/fenl_functors.hpp @@ -69,44 +69,42 @@ namespace Kokkos { namespace Example { namespace FENL { -template< class ElemNodeIdView , class CrsGraphType , unsigned ElemNode > +template class NodeNodeGraph { -public: + public: + typedef typename ElemNodeIdView::execution_space execution_space; + typedef pair key_type; - typedef typename ElemNodeIdView::execution_space execution_space ; - typedef pair key_type ; - - typedef Kokkos::UnorderedMap< key_type, void , execution_space > SetType ; - typedef typename CrsGraphType::row_map_type::non_const_type RowMapType ; - typedef Kokkos::View< unsigned , execution_space > UnsignedValue ; + typedef Kokkos::UnorderedMap SetType; + typedef typename CrsGraphType::row_map_type::non_const_type RowMapType; + typedef Kokkos::View UnsignedValue; // Static dimensions of 0 generate compiler warnings or errors. - typedef Kokkos::View< unsigned*[ElemNode][ElemNode] , execution_space > - ElemGraphType ; - -private: - - enum PhaseType { FILL_NODE_SET , - SCAN_NODE_COUNT , - FILL_GRAPH_ENTRIES , - SORT_GRAPH_ENTRIES , - FILL_ELEMENT_GRAPH }; - - const unsigned node_count ; - const ElemNodeIdView elem_node_id ; - UnsignedValue row_total ; - RowMapType row_count ; - RowMapType row_map ; - SetType node_node_set ; - PhaseType phase ; + typedef Kokkos::View + ElemGraphType; + + private: + enum PhaseType { + FILL_NODE_SET, + SCAN_NODE_COUNT, + FILL_GRAPH_ENTRIES, + SORT_GRAPH_ENTRIES, + FILL_ELEMENT_GRAPH + }; -public: + const unsigned node_count; + const ElemNodeIdView elem_node_id; + UnsignedValue row_total; + RowMapType row_count; + RowMapType row_map; + SetType node_node_set; + PhaseType phase; - CrsGraphType graph ; - ElemGraphType elem_graph ; + public: + CrsGraphType graph; + ElemGraphType elem_graph; - struct Times - { + struct Times { double ratio; double fill_node_set; double scan_node_count; @@ -115,139 +113,146 @@ class NodeNodeGraph { double fill_element_graph; }; - NodeNodeGraph( const ElemNodeIdView & arg_elem_node_id , - const unsigned arg_node_count, - Times & results - ) - : node_count(arg_node_count) - , elem_node_id( arg_elem_node_id ) - , row_total( "row_total" ) - , row_count(Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_count") , node_count ) // will deep_copy to 0 inside loop - , row_map( "graph_row_map" , node_count + 1 ) - , node_node_set() - , phase( FILL_NODE_SET ) - , graph() - , elem_graph() - { - //-------------------------------- - // Guess at span required for the map: - - Kokkos::Timer wall_clock ; - - wall_clock.reset(); - phase = FILL_NODE_SET ; - - // upper bound on the span - size_t set_span = (28ull * node_count) / 2; - - { - // Zero the row count to restart the fill - Kokkos::deep_copy( row_count , 0u ); - - node_node_set = SetType( set_span ); - - // May be larger that requested: - set_span = node_node_set.span(); - - Kokkos::parallel_for( "kokkos-kernels/example/fenl: NodeNodeGraph" , elem_node_id.extent(0) , *this ); - } + NodeNodeGraph(const ElemNodeIdView& arg_elem_node_id, + const unsigned arg_node_count, Times& results) + : node_count(arg_node_count), + elem_node_id(arg_elem_node_id), + row_total("row_total"), + row_count(Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_count"), + node_count) // will deep_copy to 0 inside loop + , + row_map("graph_row_map", node_count + 1), + node_node_set(), + phase(FILL_NODE_SET), + graph(), + elem_graph() { + //-------------------------------- + // Guess at span required for the map: + + Kokkos::Timer wall_clock; + + wall_clock.reset(); + phase = FILL_NODE_SET; + + // upper bound on the span + size_t set_span = (28ull * node_count) / 2; - execution_space().fence(); - results.ratio = (double)node_node_set.size() / (double)node_node_set.span(); - results.fill_node_set = wall_clock.seconds(); - //-------------------------------- + { + // Zero the row count to restart the fill + Kokkos::deep_copy(row_count, 0u); - wall_clock.reset(); - phase = SCAN_NODE_COUNT ; + node_node_set = SetType(set_span); - // Exclusive scan of row_count into row_map - // including the final total in the 'node_count + 1' position. - // Zero the 'row_count' values. - Kokkos::parallel_scan( node_count , *this ); + // May be larger that requested: + set_span = node_node_set.span(); - // Zero the row count for the fill: - Kokkos::deep_copy( row_count , 0u ); + Kokkos::parallel_for("kokkos-kernels/example/fenl: NodeNodeGraph", + elem_node_id.extent(0), *this); + } - unsigned graph_entry_count = 0 ; + execution_space().fence(); + results.ratio = (double)node_node_set.size() / (double)node_node_set.span(); + results.fill_node_set = wall_clock.seconds(); + //-------------------------------- - Kokkos::deep_copy( graph_entry_count , row_total ); + wall_clock.reset(); + phase = SCAN_NODE_COUNT; - // Assign graph's row_map and allocate graph's entries - graph.row_map = row_map ; - graph.entries = typename CrsGraphType::entries_type( "graph_entries" , graph_entry_count ); + // Exclusive scan of row_count into row_map + // including the final total in the 'node_count + 1' position. + // Zero the 'row_count' values. + Kokkos::parallel_scan(node_count, *this); - //-------------------------------- - // Fill graph's entries from the (node,node) set. + // Zero the row count for the fill: + Kokkos::deep_copy(row_count, 0u); - execution_space().fence(); - results.scan_node_count = wall_clock.seconds(); + unsigned graph_entry_count = 0; - wall_clock.reset(); - phase = FILL_GRAPH_ENTRIES ; - Kokkos::parallel_for( node_node_set.span() , *this ); + Kokkos::deep_copy(graph_entry_count, row_total); - execution_space().fence(); - results.fill_graph_entries = wall_clock.seconds(); + // Assign graph's row_map and allocate graph's entries + graph.row_map = row_map; + graph.entries = + typename CrsGraphType::entries_type("graph_entries", graph_entry_count); - //-------------------------------- - // Done with the temporary sets and arrays - wall_clock.reset(); - phase = SORT_GRAPH_ENTRIES ; + //-------------------------------- + // Fill graph's entries from the (node,node) set. - row_total = UnsignedValue(); - row_count = RowMapType(); - row_map = RowMapType(); - node_node_set.clear(); + execution_space().fence(); + results.scan_node_count = wall_clock.seconds(); - //-------------------------------- + wall_clock.reset(); + phase = FILL_GRAPH_ENTRIES; + Kokkos::parallel_for(node_node_set.span(), *this); - Kokkos::parallel_for( node_count , *this ); + execution_space().fence(); + results.fill_graph_entries = wall_clock.seconds(); - execution_space().fence(); - results.sort_graph_entries = wall_clock.seconds(); + //-------------------------------- + // Done with the temporary sets and arrays + wall_clock.reset(); + phase = SORT_GRAPH_ENTRIES; - //-------------------------------- - // Element-to-graph mapping: - wall_clock.reset(); - phase = FILL_ELEMENT_GRAPH ; - elem_graph = ElemGraphType("elem_graph", elem_node_id.extent(0) ); - Kokkos::parallel_for( elem_node_id.extent(0) , *this ); + row_total = UnsignedValue(); + row_count = RowMapType(); + row_map = RowMapType(); + node_node_set.clear(); - execution_space().fence(); - results.fill_element_graph = wall_clock.seconds(); - } + //-------------------------------- + + Kokkos::parallel_for(node_count, *this); + + execution_space().fence(); + results.sort_graph_entries = wall_clock.seconds(); + + //-------------------------------- + // Element-to-graph mapping: + wall_clock.reset(); + phase = FILL_ELEMENT_GRAPH; + elem_graph = ElemGraphType("elem_graph", elem_node_id.extent(0)); + Kokkos::parallel_for(elem_node_id.extent(0), *this); + + execution_space().fence(); + results.fill_element_graph = wall_clock.seconds(); + } //------------------------------------ // parallel_for: create map and count row length KOKKOS_INLINE_FUNCTION - void fill_set( const unsigned ielem ) const - { + void fill_set(const unsigned ielem) const { // Loop over element's (row_local_node,col_local_node) pairs: - for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) { - - const unsigned row_node = elem_node_id( ielem , row_local_node ); + for (unsigned row_local_node = 0; row_local_node < elem_node_id.extent(1); + ++row_local_node) { + const unsigned row_node = elem_node_id(ielem, row_local_node); - for ( unsigned col_local_node = row_local_node ; col_local_node < elem_node_id.extent(1) ; ++col_local_node ) { + for (unsigned col_local_node = row_local_node; + col_local_node < elem_node_id.extent(1); ++col_local_node) { + const unsigned col_node = elem_node_id(ielem, col_local_node); - const unsigned col_node = elem_node_id( ielem , col_local_node ); + // If either node is locally owned then insert the pair into the + // unordered map: - // If either node is locally owned then insert the pair into the unordered map: + if (row_node < row_count.extent(0) || col_node < row_count.extent(0)) { + const key_type key = (row_node < col_node) + ? make_pair(row_node, col_node) + : make_pair(col_node, row_node); - if ( row_node < row_count.extent(0) || col_node < row_count.extent(0) ) { - - const key_type key = (row_node < col_node) ? make_pair( row_node, col_node ) : make_pair( col_node, row_node ) ; - - const typename SetType::insert_result result = node_node_set.insert( key ); + const typename SetType::insert_result result = + node_node_set.insert(key); // A successfull insert: the first time this pair was added - if ( result.success() ) { - + if (result.success()) { // If row node is owned then increment count - if ( row_node < row_count.extent(0) ) { atomic_fetch_add( & row_count( row_node ) , 1 ); } + if (row_node < row_count.extent(0)) { + atomic_fetch_add(&row_count(row_node), 1); + } - // If column node is owned and not equal to row node then increment count - if ( col_node < row_count.extent(0) && col_node != row_node ) { atomic_fetch_add( & row_count( col_node ) , 1 ); } + // If column node is owned and not equal to row node then increment + // count + if (col_node < row_count.extent(0) && col_node != row_node) { + atomic_fetch_add(&row_count(col_node), 1); + } } } } @@ -255,114 +260,113 @@ class NodeNodeGraph { } KOKKOS_INLINE_FUNCTION - void fill_graph_entries( const unsigned iset ) const - { - if ( node_node_set.valid_at(iset) ) { + void fill_graph_entries(const unsigned iset) const { + if (node_node_set.valid_at(iset)) { // Add each entry to the graph entries. - const key_type key = node_node_set.key_at(iset) ; - const unsigned row_node = key.first ; - const unsigned col_node = key.second ; + const key_type key = node_node_set.key_at(iset); + const unsigned row_node = key.first; + const unsigned col_node = key.second; - if ( row_node < row_count.extent(0) ) { - const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 ); - graph.entries( offset ) = col_node ; + if (row_node < row_count.extent(0)) { + const unsigned offset = + graph.row_map(row_node) + atomic_fetch_add(&row_count(row_node), 1); + graph.entries(offset) = col_node; } - if ( col_node < row_count.extent(0) && col_node != row_node ) { - const unsigned offset = graph.row_map( col_node ) + atomic_fetch_add( & row_count( col_node ) , 1 ); - graph.entries( offset ) = row_node ; + if (col_node < row_count.extent(0) && col_node != row_node) { + const unsigned offset = + graph.row_map(col_node) + atomic_fetch_add(&row_count(col_node), 1); + graph.entries(offset) = row_node; } } } KOKKOS_INLINE_FUNCTION - void sort_graph_entries( const unsigned irow ) const - { - const unsigned row_beg = graph.row_map( irow ); - const unsigned row_end = graph.row_map( irow + 1 ); - for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) { + void sort_graph_entries(const unsigned irow) const { + const unsigned row_beg = graph.row_map(irow); + const unsigned row_end = graph.row_map(irow + 1); + for (unsigned i = row_beg + 1; i < row_end; ++i) { const unsigned col = graph.entries(i); - unsigned j = i ; - for ( ; row_beg < j && col < graph.entries(j-1) ; --j ) { - graph.entries(j) = graph.entries(j-1); + unsigned j = i; + for (; row_beg < j && col < graph.entries(j - 1); --j) { + graph.entries(j) = graph.entries(j - 1); } - graph.entries(j) = col ; + graph.entries(j) = col; } } KOKKOS_INLINE_FUNCTION - void fill_elem_graph_map( const unsigned ielem ) const - { - for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) { - - const unsigned row_node = elem_node_id( ielem , row_local_node ); + void fill_elem_graph_map(const unsigned ielem) const { + for (unsigned row_local_node = 0; row_local_node < elem_node_id.extent(1); + ++row_local_node) { + const unsigned row_node = elem_node_id(ielem, row_local_node); - for ( unsigned col_local_node = 0 ; col_local_node < elem_node_id.extent(1) ; ++col_local_node ) { + for (unsigned col_local_node = 0; col_local_node < elem_node_id.extent(1); + ++col_local_node) { + const unsigned col_node = elem_node_id(ielem, col_local_node); - const unsigned col_node = elem_node_id( ielem , col_local_node ); + unsigned entry = ~0u; - unsigned entry = ~0u ; + if (row_node + 1 < graph.row_map.extent(0)) { + const unsigned entry_end = graph.row_map(row_node + 1); - if ( row_node + 1 < graph.row_map.extent(0) ) { + entry = graph.row_map(row_node); - const unsigned entry_end = graph.row_map( row_node + 1 ); + for (; entry < entry_end && graph.entries(entry) != col_node; ++entry) + ; - entry = graph.row_map( row_node ); - - for ( ; entry < entry_end && graph.entries(entry) != col_node ; ++entry ); - - if ( entry == entry_end ) entry = ~0u ; + if (entry == entry_end) entry = ~0u; } - elem_graph( ielem , row_local_node , col_local_node ) = entry ; + elem_graph(ielem, row_local_node, col_local_node) = entry; } } } KOKKOS_INLINE_FUNCTION - void operator()( const unsigned iwork ) const - { - if ( phase == FILL_NODE_SET ) { - fill_set( iwork ); - } - else if ( phase == FILL_GRAPH_ENTRIES ) { - fill_graph_entries( iwork ); - } - else if ( phase == SORT_GRAPH_ENTRIES ) { - sort_graph_entries( iwork ); - } - else if ( phase == FILL_ELEMENT_GRAPH ) { - fill_elem_graph_map( iwork ); + void operator()(const unsigned iwork) const { + if (phase == FILL_NODE_SET) { + fill_set(iwork); + } else if (phase == FILL_GRAPH_ENTRIES) { + fill_graph_entries(iwork); + } else if (phase == SORT_GRAPH_ENTRIES) { + sort_graph_entries(iwork); + } else if (phase == FILL_ELEMENT_GRAPH) { + fill_elem_graph_map(iwork); } } //------------------------------------ // parallel_scan: row offsets - typedef unsigned value_type ; + typedef unsigned value_type; KOKKOS_INLINE_FUNCTION - void operator()( const unsigned irow , unsigned & update , const bool final ) const - { + void operator()(const unsigned irow, unsigned& update, + const bool final) const { // exclusive scan - if ( final ) { row_map( irow ) = update ; } + if (final) { + row_map(irow) = update; + } - update += row_count( irow ); + update += row_count(irow); - if ( final ) { - if ( irow + 1 == row_count.extent(0) ) { - row_map( irow + 1 ) = update ; - row_total() = update ; + if (final) { + if (irow + 1 == row_count.extent(0)) { + row_map(irow + 1) = update; + row_total() = update; } } } KOKKOS_INLINE_FUNCTION - void init( unsigned & update ) const { update = 0 ; } + void init(unsigned& update) const { update = 0; } KOKKOS_INLINE_FUNCTION - void join( volatile unsigned & update , const volatile unsigned & input ) const { update += input ; } + void join(volatile unsigned& update, const volatile unsigned& input) const { + update += input; + } //------------------------------------ }; @@ -377,222 +381,210 @@ namespace Kokkos { namespace Example { namespace FENL { -template< class ElemCompType > +template class NodeElemGatherFill { -public: - - typedef typename ElemCompType::execution_space execution_space ; - typedef typename ElemCompType::vector_type vector_type ; - typedef typename ElemCompType::sparse_matrix_type sparse_matrix_type ; - typedef typename ElemCompType::elem_node_type elem_node_type ; - typedef typename ElemCompType::elem_vectors_type elem_vectors_type ; - typedef typename ElemCompType::elem_matrices_type elem_matrices_type ; - typedef typename ElemCompType::elem_graph_type elem_graph_type ; + public: + typedef typename ElemCompType::execution_space execution_space; + typedef typename ElemCompType::vector_type vector_type; + typedef typename ElemCompType::sparse_matrix_type sparse_matrix_type; + typedef typename ElemCompType::elem_node_type elem_node_type; + typedef typename ElemCompType::elem_vectors_type elem_vectors_type; + typedef typename ElemCompType::elem_matrices_type elem_matrices_type; + typedef typename ElemCompType::elem_graph_type elem_graph_type; - static const unsigned ElemNodeCount = ElemCompType::ElemNodeCount ; + static const unsigned ElemNodeCount = ElemCompType::ElemNodeCount; //------------------------------------ -private: - - typedef Kokkos::StaticCrsGraph< unsigned[2] , execution_space > CrsGraphType ; - typedef typename CrsGraphType::row_map_type::non_const_type RowMapType ; - typedef Kokkos::View< unsigned , execution_space > UnsignedValue ; - - enum PhaseType { FILL_NODE_COUNT , - SCAN_NODE_COUNT , - FILL_GRAPH_ENTRIES , - SORT_GRAPH_ENTRIES , - GATHER_FILL }; - - const elem_node_type elem_node_id ; - const elem_graph_type elem_graph ; - UnsignedValue row_total ; - RowMapType row_count ; - RowMapType row_map ; - CrsGraphType graph ; - vector_type residual ; - sparse_matrix_type jacobian ; - elem_vectors_type elem_residual ; - elem_matrices_type elem_jacobian ; - PhaseType phase ; - -public: + private: + typedef Kokkos::StaticCrsGraph CrsGraphType; + typedef typename CrsGraphType::row_map_type::non_const_type RowMapType; + typedef Kokkos::View UnsignedValue; + + enum PhaseType { + FILL_NODE_COUNT, + SCAN_NODE_COUNT, + FILL_GRAPH_ENTRIES, + SORT_GRAPH_ENTRIES, + GATHER_FILL + }; + const elem_node_type elem_node_id; + const elem_graph_type elem_graph; + UnsignedValue row_total; + RowMapType row_count; + RowMapType row_map; + CrsGraphType graph; + vector_type residual; + sparse_matrix_type jacobian; + elem_vectors_type elem_residual; + elem_matrices_type elem_jacobian; + PhaseType phase; + + public: NodeElemGatherFill() - : elem_node_id() - , elem_graph() - , row_total() - , row_count() - , row_map() - , graph() - , residual() - , jacobian() - , elem_residual() - , elem_jacobian() - , phase( FILL_NODE_COUNT ) - {} - - NodeElemGatherFill( const NodeElemGatherFill & rhs ) - : elem_node_id( rhs.elem_node_id ) - , elem_graph( rhs.elem_graph ) - , row_total( rhs.row_total ) - , row_count( rhs.row_count ) - , row_map( rhs.row_map ) - , graph( rhs.graph ) - , residual( rhs.residual ) - , jacobian( rhs.jacobian ) - , elem_residual( rhs.elem_residual ) - , elem_jacobian( rhs.elem_jacobian ) - , phase( rhs.phase ) - {} - - NodeElemGatherFill( const elem_node_type & arg_elem_node_id , - const elem_graph_type & arg_elem_graph , - const vector_type & arg_residual , - const sparse_matrix_type & arg_jacobian , - const elem_vectors_type & arg_elem_residual , - const elem_matrices_type & arg_elem_jacobian ) - : elem_node_id( arg_elem_node_id ) - , elem_graph( arg_elem_graph ) - , row_total( "row_total" ) - , row_count( "row_count" , arg_residual.extent(0) ) - , row_map( "graph_row_map" , arg_residual.extent(0) + 1 ) - , graph() - , residual( arg_residual ) - , jacobian( arg_jacobian ) - , elem_residual( arg_elem_residual ) - , elem_jacobian( arg_elem_jacobian ) - , phase( FILL_NODE_COUNT ) - { - //-------------------------------- - // Count node->element relations - - phase = FILL_NODE_COUNT ; - - Kokkos::parallel_for( elem_node_id.extent(0) , *this ); - - //-------------------------------- - - phase = SCAN_NODE_COUNT ; - - // Exclusive scan of row_count into row_map - // including the final total in the 'node_count + 1' position. - // Zero the 'row_count' values. - Kokkos::parallel_scan( residual.extent(0) , *this ); - - // Zero the row count for the fill: - Kokkos::deep_copy( row_count , typename RowMapType::value_type(0) ); - - unsigned graph_entry_count = 0 ; - - Kokkos::deep_copy( graph_entry_count , row_total ); - - // Assign graph's row_map and allocate graph's entries - graph.row_map = row_map ; - - typedef typename CrsGraphType::entries_type graph_entries_type ; - - graph.entries = graph_entries_type( "graph_entries" , graph_entry_count ); - - //-------------------------------- - // Fill graph's entries from the (node,node) set. - - phase = FILL_GRAPH_ENTRIES ; - - Kokkos::deep_copy( row_count , 0u ); - Kokkos::parallel_for( elem_node_id.extent(0) , *this ); - - execution_space().fence(); - - //-------------------------------- - // Done with the temporary sets and arrays - - row_total = UnsignedValue(); - row_count = RowMapType(); - row_map = RowMapType(); - - //-------------------------------- - - phase = SORT_GRAPH_ENTRIES ; - Kokkos::parallel_for( residual.extent(0) , *this ); - - execution_space().fence(); - - phase = GATHER_FILL ; - } - - void apply() const - { - Kokkos::parallel_for( residual.extent(0) , *this ); + : elem_node_id(), + elem_graph(), + row_total(), + row_count(), + row_map(), + graph(), + residual(), + jacobian(), + elem_residual(), + elem_jacobian(), + phase(FILL_NODE_COUNT) {} + + NodeElemGatherFill(const NodeElemGatherFill& rhs) + : elem_node_id(rhs.elem_node_id), + elem_graph(rhs.elem_graph), + row_total(rhs.row_total), + row_count(rhs.row_count), + row_map(rhs.row_map), + graph(rhs.graph), + residual(rhs.residual), + jacobian(rhs.jacobian), + elem_residual(rhs.elem_residual), + elem_jacobian(rhs.elem_jacobian), + phase(rhs.phase) {} + + NodeElemGatherFill(const elem_node_type& arg_elem_node_id, + const elem_graph_type& arg_elem_graph, + const vector_type& arg_residual, + const sparse_matrix_type& arg_jacobian, + const elem_vectors_type& arg_elem_residual, + const elem_matrices_type& arg_elem_jacobian) + : elem_node_id(arg_elem_node_id), + elem_graph(arg_elem_graph), + row_total("row_total"), + row_count("row_count", arg_residual.extent(0)), + row_map("graph_row_map", arg_residual.extent(0) + 1), + graph(), + residual(arg_residual), + jacobian(arg_jacobian), + elem_residual(arg_elem_residual), + elem_jacobian(arg_elem_jacobian), + phase(FILL_NODE_COUNT) { + //-------------------------------- + // Count node->element relations + + phase = FILL_NODE_COUNT; + + Kokkos::parallel_for(elem_node_id.extent(0), *this); + + //-------------------------------- + + phase = SCAN_NODE_COUNT; + + // Exclusive scan of row_count into row_map + // including the final total in the 'node_count + 1' position. + // Zero the 'row_count' values. + Kokkos::parallel_scan(residual.extent(0), *this); + + // Zero the row count for the fill: + Kokkos::deep_copy(row_count, typename RowMapType::value_type(0)); + + unsigned graph_entry_count = 0; + + Kokkos::deep_copy(graph_entry_count, row_total); + + // Assign graph's row_map and allocate graph's entries + graph.row_map = row_map; + + typedef typename CrsGraphType::entries_type graph_entries_type; + + graph.entries = graph_entries_type("graph_entries", graph_entry_count); + + //-------------------------------- + // Fill graph's entries from the (node,node) set. + + phase = FILL_GRAPH_ENTRIES; + + Kokkos::deep_copy(row_count, 0u); + Kokkos::parallel_for(elem_node_id.extent(0), *this); + + execution_space().fence(); + + //-------------------------------- + // Done with the temporary sets and arrays + + row_total = UnsignedValue(); + row_count = RowMapType(); + row_map = RowMapType(); + + //-------------------------------- + + phase = SORT_GRAPH_ENTRIES; + Kokkos::parallel_for(residual.extent(0), *this); + + execution_space().fence(); + + phase = GATHER_FILL; } + void apply() const { Kokkos::parallel_for(residual.extent(0), *this); } + //------------------------------------ //------------------------------------ // parallel_for: Count node->element pairs KOKKOS_INLINE_FUNCTION - void fill_node_count( const unsigned ielem ) const - { - for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) { + void fill_node_count(const unsigned ielem) const { + for (unsigned row_local_node = 0; row_local_node < elem_node_id.extent(1); + ++row_local_node) { + const unsigned row_node = elem_node_id(ielem, row_local_node); - const unsigned row_node = elem_node_id( ielem , row_local_node ); - - if ( row_node < row_count.extent(0) ) { - atomic_fetch_add( & row_count( row_node ) , 1 ); + if (row_node < row_count.extent(0)) { + atomic_fetch_add(&row_count(row_node), 1); } } } KOKKOS_INLINE_FUNCTION - void fill_graph_entries( const unsigned ielem ) const - { - for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) { - - const unsigned row_node = elem_node_id( ielem , row_local_node ); - - if ( row_node < row_count.extent(0) ) { + void fill_graph_entries(const unsigned ielem) const { + for (unsigned row_local_node = 0; row_local_node < elem_node_id.extent(1); + ++row_local_node) { + const unsigned row_node = elem_node_id(ielem, row_local_node); - const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 ); + if (row_node < row_count.extent(0)) { + const unsigned offset = + graph.row_map(row_node) + atomic_fetch_add(&row_count(row_node), 1); - graph.entries( offset , 0 ) = ielem ; - graph.entries( offset , 1 ) = row_local_node ; + graph.entries(offset, 0) = ielem; + graph.entries(offset, 1) = row_local_node; } } } KOKKOS_INLINE_FUNCTION - void sort_graph_entries( const unsigned irow ) const - { - const unsigned row_beg = graph.row_map( irow ); - const unsigned row_end = graph.row_map( irow + 1 ); - for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) { - const unsigned elem = graph.entries(i,0); - const unsigned local = graph.entries(i,1); - unsigned j = i ; - for ( ; row_beg < j && elem < graph.entries(j-1,0) ; --j ) { - graph.entries(j,0) = graph.entries(j-1,0); - graph.entries(j,1) = graph.entries(j-1,1); + void sort_graph_entries(const unsigned irow) const { + const unsigned row_beg = graph.row_map(irow); + const unsigned row_end = graph.row_map(irow + 1); + for (unsigned i = row_beg + 1; i < row_end; ++i) { + const unsigned elem = graph.entries(i, 0); + const unsigned local = graph.entries(i, 1); + unsigned j = i; + for (; row_beg < j && elem < graph.entries(j - 1, 0); --j) { + graph.entries(j, 0) = graph.entries(j - 1, 0); + graph.entries(j, 1) = graph.entries(j - 1, 1); } - graph.entries(j,0) = elem ; - graph.entries(j,1) = local ; + graph.entries(j, 0) = elem; + graph.entries(j, 1) = local; } } //------------------------------------ KOKKOS_INLINE_FUNCTION - void gather_fill( const unsigned irow ) const - { + void gather_fill(const unsigned irow) const { const unsigned node_elem_begin = graph.row_map(irow); - const unsigned node_elem_end = graph.row_map(irow+1); + const unsigned node_elem_end = graph.row_map(irow + 1); // for each element that a node belongs to - for ( unsigned i = node_elem_begin ; i < node_elem_end ; i++ ) { - - const unsigned elem_id = graph.entries( i, 0); - const unsigned row_index = graph.entries( i, 1); + for (unsigned i = node_elem_begin; i < node_elem_end; i++) { + const unsigned elem_id = graph.entries(i, 0); + const unsigned row_index = graph.entries(i, 1); residual(irow) += elem_residual(elem_id, row_index); @@ -600,10 +592,10 @@ class NodeElemGatherFill { // gather the contents of the element stiffness // matrix that belong in irow - for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) { - const unsigned A_index = elem_graph( elem_id , row_index , j ); + for (unsigned j = 0; j < ElemNodeCount; ++j) { + const unsigned A_index = elem_graph(elem_id, row_index, j); - jacobian.values( A_index ) += elem_jacobian( elem_id, row_index, j ); + jacobian.values(A_index) += elem_jacobian(elem_id, row_index, j); } } } @@ -611,48 +603,48 @@ class NodeElemGatherFill { //------------------------------------ KOKKOS_INLINE_FUNCTION - void operator()( const unsigned iwork ) const - { - if ( phase == FILL_NODE_COUNT ) { - fill_node_count( iwork ); - } - else if ( phase == FILL_GRAPH_ENTRIES ) { - fill_graph_entries( iwork ); - } - else if ( phase == SORT_GRAPH_ENTRIES ) { - sort_graph_entries( iwork ); - } - else if ( phase == GATHER_FILL ) { - gather_fill( iwork ); + void operator()(const unsigned iwork) const { + if (phase == FILL_NODE_COUNT) { + fill_node_count(iwork); + } else if (phase == FILL_GRAPH_ENTRIES) { + fill_graph_entries(iwork); + } else if (phase == SORT_GRAPH_ENTRIES) { + sort_graph_entries(iwork); + } else if (phase == GATHER_FILL) { + gather_fill(iwork); } } //------------------------------------ // parallel_scan: row offsets - typedef unsigned value_type ; + typedef unsigned value_type; KOKKOS_INLINE_FUNCTION - void operator()( const unsigned irow , unsigned & update , const bool final ) const - { + void operator()(const unsigned irow, unsigned& update, + const bool final) const { // exclusive scan - if ( final ) { row_map( irow ) = update ; } + if (final) { + row_map(irow) = update; + } - update += row_count( irow ); + update += row_count(irow); - if ( final ) { - if ( irow + 1 == row_count.extent(0) ) { - row_map( irow + 1 ) = update ; - row_total() = update ; + if (final) { + if (irow + 1 == row_count.extent(0)) { + row_map(irow + 1) = update; + row_total() = update; } } } KOKKOS_INLINE_FUNCTION - void init( unsigned & update ) const { update = 0 ; } + void init(unsigned& update) const { update = 0; } KOKKOS_INLINE_FUNCTION - void join( volatile unsigned & update , const volatile unsigned & input ) const { update += input ; } + void join(volatile unsigned& update, const volatile unsigned& input) const { + update += input; + } }; } /* namespace FENL */ @@ -665,188 +657,191 @@ namespace Kokkos { namespace Example { namespace FENL { -template< class FiniteElementMeshType , class SparseMatrixType > -class ElementComputation ; +template +class ElementComputation; - -template< class DeviceType , BoxElemPart::ElemOrder Order , class CoordinateMap , - typename ScalarType , typename OrdinalType , class MemoryTraits , typename SizeType > +template class ElementComputation< - Kokkos::Example::BoxElemFixture< DeviceType , Order , CoordinateMap > , - KokkosSparse::CrsMatrix< ScalarType , OrdinalType , DeviceType , MemoryTraits , SizeType > > -{ -public: - - typedef Kokkos::Example::BoxElemFixture< DeviceType, Order, CoordinateMap > mesh_type ; - typedef Kokkos::Example::HexElement_Data< mesh_type::ElemNode > element_data_type ; - - typedef KokkosSparse::CrsMatrix< ScalarType , OrdinalType , DeviceType , MemoryTraits , SizeType > sparse_matrix_type ; - typedef typename sparse_matrix_type::StaticCrsGraphType sparse_graph_type ; - - typedef DeviceType execution_space ; - typedef ScalarType scalar_type ; - - static const unsigned SpatialDim = element_data_type::spatial_dimension ; - static const unsigned TensorDim = SpatialDim * SpatialDim ; - static const unsigned ElemNodeCount = element_data_type::element_node_count ; - static const unsigned FunctionCount = element_data_type::function_count ; - static const unsigned IntegrationCount = element_data_type::integration_count ; + Kokkos::Example::BoxElemFixture, + KokkosSparse::CrsMatrix > { + public: + typedef Kokkos::Example::BoxElemFixture + mesh_type; + typedef Kokkos::Example::HexElement_Data + element_data_type; + + typedef KokkosSparse::CrsMatrix + sparse_matrix_type; + typedef typename sparse_matrix_type::StaticCrsGraphType sparse_graph_type; + + typedef DeviceType execution_space; + typedef ScalarType scalar_type; + + static const unsigned SpatialDim = element_data_type::spatial_dimension; + static const unsigned TensorDim = SpatialDim * SpatialDim; + static const unsigned ElemNodeCount = element_data_type::element_node_count; + static const unsigned FunctionCount = element_data_type::function_count; + static const unsigned IntegrationCount = element_data_type::integration_count; //------------------------------------ - typedef typename mesh_type::node_coord_type node_coord_type ; - typedef typename mesh_type::elem_node_type elem_node_type ; - typedef Kokkos::View< scalar_type*[FunctionCount][FunctionCount] , execution_space > elem_matrices_type ; - typedef Kokkos::View< scalar_type*[FunctionCount] , execution_space > elem_vectors_type ; - typedef Kokkos::View< scalar_type* , execution_space > vector_type ; + typedef typename mesh_type::node_coord_type node_coord_type; + typedef typename mesh_type::elem_node_type elem_node_type; + typedef Kokkos::View + elem_matrices_type; + typedef Kokkos::View + elem_vectors_type; + typedef Kokkos::View vector_type; - typedef typename NodeNodeGraph< elem_node_type , sparse_graph_type , ElemNodeCount >::ElemGraphType elem_graph_type ; + typedef typename NodeNodeGraph::ElemGraphType elem_graph_type; //------------------------------------ - //------------------------------------ // Computational data: - const element_data_type elem_data ; - const elem_node_type elem_node_ids ; - const node_coord_type node_coords ; - const elem_graph_type elem_graph ; - const elem_matrices_type elem_jacobians ; - const elem_vectors_type elem_residuals ; - const vector_type solution ; - const vector_type residual ; - const sparse_matrix_type jacobian ; - const scalar_type coeff_K ; - - ElementComputation( const ElementComputation & rhs ) - : elem_data() - , elem_node_ids( rhs.elem_node_ids ) - , node_coords( rhs.node_coords ) - , elem_graph( rhs.elem_graph ) - , elem_jacobians( rhs.elem_jacobians ) - , elem_residuals( rhs.elem_residuals ) - , solution( rhs.solution ) - , residual( rhs.residual ) - , jacobian( rhs.jacobian ) - , coeff_K( rhs.coeff_K ) - {} + const element_data_type elem_data; + const elem_node_type elem_node_ids; + const node_coord_type node_coords; + const elem_graph_type elem_graph; + const elem_matrices_type elem_jacobians; + const elem_vectors_type elem_residuals; + const vector_type solution; + const vector_type residual; + const sparse_matrix_type jacobian; + const scalar_type coeff_K; + + ElementComputation(const ElementComputation& rhs) + : elem_data(), + elem_node_ids(rhs.elem_node_ids), + node_coords(rhs.node_coords), + elem_graph(rhs.elem_graph), + elem_jacobians(rhs.elem_jacobians), + elem_residuals(rhs.elem_residuals), + solution(rhs.solution), + residual(rhs.residual), + jacobian(rhs.jacobian), + coeff_K(rhs.coeff_K) {} // If the element->sparse_matrix graph is provided then perform atomic updates - // Otherwise fill per-element contributions for subequent gather-add into a residual and jacobian. - ElementComputation( const mesh_type & arg_mesh , - const scalar_type arg_coeff_K , - const vector_type & arg_solution , - const elem_graph_type & arg_elem_graph , - const sparse_matrix_type & arg_jacobian , - const vector_type & arg_residual ) - : elem_data() - , elem_node_ids( arg_mesh.elem_node() ) - , node_coords( arg_mesh.node_coord() ) - , elem_graph( arg_elem_graph ) - , elem_jacobians() - , elem_residuals() - , solution( arg_solution ) - , residual( arg_residual ) - , jacobian( arg_jacobian ) - , coeff_K( arg_coeff_K ) - {} - - ElementComputation( const mesh_type & arg_mesh , - const scalar_type arg_coeff_K , - const vector_type & arg_solution ) - : elem_data() - , elem_node_ids( arg_mesh.elem_node() ) - , node_coords( arg_mesh.node_coord() ) - , elem_graph() - , elem_jacobians( "elem_jacobians" , arg_mesh.elem_count() ) - , elem_residuals( "elem_residuals" , arg_mesh.elem_count() ) - , solution( arg_solution ) - , residual() - , jacobian() - , coeff_K( arg_coeff_K ) - {} + // Otherwise fill per-element contributions for subequent gather-add into a + // residual and jacobian. + ElementComputation(const mesh_type& arg_mesh, const scalar_type arg_coeff_K, + const vector_type& arg_solution, + const elem_graph_type& arg_elem_graph, + const sparse_matrix_type& arg_jacobian, + const vector_type& arg_residual) + : elem_data(), + elem_node_ids(arg_mesh.elem_node()), + node_coords(arg_mesh.node_coord()), + elem_graph(arg_elem_graph), + elem_jacobians(), + elem_residuals(), + solution(arg_solution), + residual(arg_residual), + jacobian(arg_jacobian), + coeff_K(arg_coeff_K) {} + + ElementComputation(const mesh_type& arg_mesh, const scalar_type arg_coeff_K, + const vector_type& arg_solution) + : elem_data(), + elem_node_ids(arg_mesh.elem_node()), + node_coords(arg_mesh.node_coord()), + elem_graph(), + elem_jacobians("elem_jacobians", arg_mesh.elem_count()), + elem_residuals("elem_residuals", arg_mesh.elem_count()), + solution(arg_solution), + residual(), + jacobian(), + coeff_K(arg_coeff_K) {} //------------------------------------ - void apply() const - { - parallel_for( elem_node_ids.extent(0) , *this ); - } + void apply() const { parallel_for(elem_node_ids.extent(0), *this); } //------------------------------------ static const unsigned FLOPS_transform_gradients = - /* Jacobian */ FunctionCount * TensorDim * 2 + - /* Inverse jacobian */ TensorDim * 6 + 6 + - /* Gradient transform */ FunctionCount * 15 ; + /* Jacobian */ FunctionCount * TensorDim * 2 + + /* Inverse jacobian */ TensorDim * 6 + 6 + + /* Gradient transform */ FunctionCount * 15; KOKKOS_INLINE_FUNCTION float transform_gradients( - const float grad[][ FunctionCount ] , // Gradient of bases master element - const double x[] , - const double y[] , - const double z[] , - float dpsidx[] , - float dpsidy[] , - float dpsidz[] ) const - { - enum { j11 = 0 , j12 = 1 , j13 = 2 , - j21 = 3 , j22 = 4 , j23 = 5 , - j31 = 6 , j32 = 7 , j33 = 8 }; + const float grad[][FunctionCount], // Gradient of bases master element + const double x[], const double y[], const double z[], float dpsidx[], + float dpsidy[], float dpsidz[]) const { + enum { + j11 = 0, + j12 = 1, + j13 = 2, + j21 = 3, + j22 = 4, + j23 = 5, + j31 = 6, + j32 = 7, + j33 = 8 + }; // Jacobian accumulation: - double J[ TensorDim ] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + double J[TensorDim] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; - for( unsigned i = 0; i < FunctionCount ; ++i ) { - const double x1 = x[i] ; - const double x2 = y[i] ; - const double x3 = z[i] ; + for (unsigned i = 0; i < FunctionCount; ++i) { + const double x1 = x[i]; + const double x2 = y[i]; + const double x3 = z[i]; - const float g1 = grad[0][i] ; - const float g2 = grad[1][i] ; - const float g3 = grad[2][i] ; + const float g1 = grad[0][i]; + const float g2 = grad[1][i]; + const float g3 = grad[2][i]; - J[j11] += g1 * x1 ; - J[j12] += g1 * x2 ; - J[j13] += g1 * x3 ; + J[j11] += g1 * x1; + J[j12] += g1 * x2; + J[j13] += g1 * x3; - J[j21] += g2 * x1 ; - J[j22] += g2 * x2 ; - J[j23] += g2 * x3 ; + J[j21] += g2 * x1; + J[j22] += g2 * x2; + J[j23] += g2 * x3; - J[j31] += g3 * x1 ; - J[j32] += g3 * x2 ; - J[j33] += g3 * x3 ; + J[j31] += g3 * x1; + J[j32] += g3 * x2; + J[j33] += g3 * x3; } // Inverse jacobian: - float invJ[ TensorDim ] = { - static_cast( J[j22] * J[j33] - J[j23] * J[j32] ) , - static_cast( J[j13] * J[j32] - J[j12] * J[j33] ) , - static_cast( J[j12] * J[j23] - J[j13] * J[j22] ) , + float invJ[TensorDim] = { + static_cast(J[j22] * J[j33] - J[j23] * J[j32]), + static_cast(J[j13] * J[j32] - J[j12] * J[j33]), + static_cast(J[j12] * J[j23] - J[j13] * J[j22]), - static_cast( J[j23] * J[j31] - J[j21] * J[j33] ) , - static_cast( J[j11] * J[j33] - J[j13] * J[j31] ) , - static_cast( J[j13] * J[j21] - J[j11] * J[j23] ) , + static_cast(J[j23] * J[j31] - J[j21] * J[j33]), + static_cast(J[j11] * J[j33] - J[j13] * J[j31]), + static_cast(J[j13] * J[j21] - J[j11] * J[j23]), - static_cast( J[j21] * J[j32] - J[j22] * J[j31] ) , - static_cast( J[j12] * J[j31] - J[j11] * J[j32] ) , - static_cast( J[j11] * J[j22] - J[j12] * J[j21] ) }; + static_cast(J[j21] * J[j32] - J[j22] * J[j31]), + static_cast(J[j12] * J[j31] - J[j11] * J[j32]), + static_cast(J[j11] * J[j22] - J[j12] * J[j21])}; - const float detJ = J[j11] * invJ[j11] + - J[j21] * invJ[j12] + - J[j31] * invJ[j13] ; + const float detJ = + J[j11] * invJ[j11] + J[j21] * invJ[j12] + J[j31] * invJ[j13]; - const float detJinv = 1.0 / detJ ; + const float detJinv = 1.0 / detJ; - for ( unsigned i = 0 ; i < TensorDim ; ++i ) { invJ[i] *= detJinv ; } + for (unsigned i = 0; i < TensorDim; ++i) { + invJ[i] *= detJinv; + } // Transform gradients: - for( unsigned i = 0; i < FunctionCount ; ++i ) { + for (unsigned i = 0; i < FunctionCount; ++i) { const float g0 = grad[0][i]; const float g1 = grad[1][i]; const float g2 = grad[2][i]; @@ -856,113 +851,101 @@ class ElementComputation< dpsidz[i] = g0 * invJ[j31] + g1 * invJ[j32] + g2 * invJ[j33]; } - return detJ ; + return detJ; } KOKKOS_INLINE_FUNCTION - void contributeResidualJacobian( - const float coeff_k , - const double dof_values[] , - const float dpsidx[] , - const float dpsidy[] , - const float dpsidz[] , - const float detJ , - const float integ_weight , - const float bases_vals[] , - double elem_res[] , - double elem_mat[][ FunctionCount ] ) const - { - double value_at_pt = 0 ; - double gradx_at_pt = 0 ; - double grady_at_pt = 0 ; - double gradz_at_pt = 0 ; - - for ( unsigned m = 0 ; m < FunctionCount ; m++ ) { - value_at_pt += dof_values[m] * bases_vals[m] ; - gradx_at_pt += dof_values[m] * dpsidx[m] ; - grady_at_pt += dof_values[m] * dpsidy[m] ; - gradz_at_pt += dof_values[m] * dpsidz[m] ; + void contributeResidualJacobian(const float coeff_k, + const double dof_values[], + const float dpsidx[], const float dpsidy[], + const float dpsidz[], const float detJ, + const float integ_weight, + const float bases_vals[], double elem_res[], + double elem_mat[][FunctionCount]) const { + double value_at_pt = 0; + double gradx_at_pt = 0; + double grady_at_pt = 0; + double gradz_at_pt = 0; + + for (unsigned m = 0; m < FunctionCount; m++) { + value_at_pt += dof_values[m] * bases_vals[m]; + gradx_at_pt += dof_values[m] * dpsidx[m]; + grady_at_pt += dof_values[m] * dpsidy[m]; + gradz_at_pt += dof_values[m] * dpsidz[m]; } - const scalar_type k_detJ_weight = coeff_k * detJ * integ_weight ; - const double res_val = value_at_pt * value_at_pt * detJ * integ_weight ; - const double mat_val = 2.0 * value_at_pt * detJ * integ_weight ; + const scalar_type k_detJ_weight = coeff_k * detJ * integ_weight; + const double res_val = value_at_pt * value_at_pt * detJ * integ_weight; + const double mat_val = 2.0 * value_at_pt * detJ * integ_weight; - // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$ - // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$ + // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d + // \Omega $$ + // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla + // \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$ - for ( unsigned m = 0; m < FunctionCount; ++m) { - double * const mat = elem_mat[m] ; + for (unsigned m = 0; m < FunctionCount; ++m) { + double* const mat = elem_mat[m]; const float bases_val_m = bases_vals[m]; - const float dpsidx_m = dpsidx[m] ; - const float dpsidy_m = dpsidy[m] ; - const float dpsidz_m = dpsidz[m] ; - - elem_res[m] += k_detJ_weight * ( dpsidx_m * gradx_at_pt + - dpsidy_m * grady_at_pt + - dpsidz_m * gradz_at_pt ) + - res_val * bases_val_m ; - - for( unsigned n = 0; n < FunctionCount; n++) { - - mat[n] += k_detJ_weight * ( dpsidx_m * dpsidx[n] + - dpsidy_m * dpsidy[n] + - dpsidz_m * dpsidz[n] ) + + const float dpsidx_m = dpsidx[m]; + const float dpsidy_m = dpsidy[m]; + const float dpsidz_m = dpsidz[m]; + + elem_res[m] += + k_detJ_weight * (dpsidx_m * gradx_at_pt + dpsidy_m * grady_at_pt + + dpsidz_m * gradz_at_pt) + + res_val * bases_val_m; + + for (unsigned n = 0; n < FunctionCount; n++) { + mat[n] += k_detJ_weight * (dpsidx_m * dpsidx[n] + dpsidy_m * dpsidy[n] + + dpsidz_m * dpsidz[n]) + mat_val * bases_val_m * bases_vals[n]; } } } KOKKOS_INLINE_FUNCTION - void operator()( const unsigned ielem ) const - { + void operator()(const unsigned ielem) const { // Gather nodal coordinates and solution vector: - double x[ FunctionCount ] ; - double y[ FunctionCount ] ; - double z[ FunctionCount ] ; - double val[ FunctionCount ] ; - unsigned node_index[ ElemNodeCount ]; + double x[FunctionCount]; + double y[FunctionCount]; + double z[FunctionCount]; + double val[FunctionCount]; + unsigned node_index[ElemNodeCount]; - for ( unsigned i = 0 ; i < ElemNodeCount ; ++i ) { - const unsigned ni = elem_node_ids( ielem , i ); + for (unsigned i = 0; i < ElemNodeCount; ++i) { + const unsigned ni = elem_node_ids(ielem, i); - node_index[i] = ni ; + node_index[i] = ni; - x[i] = node_coords( ni , 0 ); - y[i] = node_coords( ni , 1 ); - z[i] = node_coords( ni , 2 ); + x[i] = node_coords(ni, 0); + y[i] = node_coords(ni, 1); + z[i] = node_coords(ni, 2); - val[i] = solution( ni ); + val[i] = solution(ni); } + double elem_vec[FunctionCount]; + double elem_mat[FunctionCount][FunctionCount]; - double elem_vec[ FunctionCount ] ; - double elem_mat[ FunctionCount ][ FunctionCount ] ; - - for( unsigned i = 0; i < FunctionCount ; i++ ) { - elem_vec[i] = 0 ; - for( unsigned j = 0; j < FunctionCount ; j++){ - elem_mat[i][j] = 0 ; + for (unsigned i = 0; i < FunctionCount; i++) { + elem_vec[i] = 0; + for (unsigned j = 0; j < FunctionCount; j++) { + elem_mat[i][j] = 0; } } + for (unsigned i = 0; i < IntegrationCount; ++i) { + float dpsidx[FunctionCount]; + float dpsidy[FunctionCount]; + float dpsidz[FunctionCount]; - for ( unsigned i = 0 ; i < IntegrationCount ; ++i ) { - float dpsidx[ FunctionCount ] ; - float dpsidy[ FunctionCount ] ; - float dpsidz[ FunctionCount ] ; + const float detJ = transform_gradients(elem_data.gradients[i], x, y, z, + dpsidx, dpsidy, dpsidz); - const float detJ = - transform_gradients( elem_data.gradients[i] , x , y , z , - dpsidx , dpsidy , dpsidz ); - - contributeResidualJacobian( coeff_K , - val , dpsidx , dpsidy , dpsidz , - detJ , - elem_data.weights[i] , - elem_data.values[i] , - elem_vec , elem_mat ); + contributeResidualJacobian(coeff_K, val, dpsidx, dpsidy, dpsidz, detJ, + elem_data.weights[i], elem_data.values[i], + elem_vec, elem_mat); } #if 0 @@ -984,24 +967,23 @@ if ( 1 == ielem ) { #endif - if ( ! residual.extent(0) ) { - for( unsigned i = 0; i < FunctionCount ; i++){ - elem_residuals(ielem, i) = elem_vec[i] ; - for( unsigned j = 0; j < FunctionCount ; j++){ - elem_jacobians(ielem, i, j) = elem_mat[i][j] ; + if (!residual.extent(0)) { + for (unsigned i = 0; i < FunctionCount; i++) { + elem_residuals(ielem, i) = elem_vec[i]; + for (unsigned j = 0; j < FunctionCount; j++) { + elem_jacobians(ielem, i, j) = elem_mat[i][j]; } } - } - else { - for( unsigned i = 0 ; i < FunctionCount ; i++ ) { - const unsigned row = node_index[i] ; - if ( row < residual.extent(0) ) { - atomic_fetch_add( & residual( row ) , elem_vec[i] ); - - for( unsigned j = 0 ; j < FunctionCount ; j++ ) { - const unsigned entry = elem_graph( ielem , i , j ); - if ( entry != ~0u ) { - atomic_fetch_add( & jacobian.values( entry ) , elem_mat[i][j] ); + } else { + for (unsigned i = 0; i < FunctionCount; i++) { + const unsigned row = node_index[i]; + if (row < residual.extent(0)) { + atomic_fetch_add(&residual(row), elem_vec[i]); + + for (unsigned j = 0; j < FunctionCount; j++) { + const unsigned entry = elem_graph(ielem, i, j); + if (entry != ~0u) { + atomic_fetch_add(&jacobian.values(entry), elem_mat[i][j]); } } } @@ -1012,119 +994,114 @@ if ( 1 == ielem ) { //---------------------------------------------------------------------------- -template< class FixtureType , class SparseMatrixType > -class DirichletComputation ; +template +class DirichletComputation; -template< class DeviceType , BoxElemPart::ElemOrder Order , class CoordinateMap , - typename ScalarType , typename OrdinalType , class MemoryTraits , typename SizeType > +template class DirichletComputation< - Kokkos::Example::BoxElemFixture< DeviceType , Order , CoordinateMap > , - KokkosSparse::CrsMatrix< ScalarType , OrdinalType , DeviceType , MemoryTraits , SizeType > > -{ -public: - - typedef Kokkos::Example::BoxElemFixture< DeviceType, Order, CoordinateMap > mesh_type ; - typedef typename mesh_type::node_coord_type node_coord_type ; - typedef typename node_coord_type::value_type scalar_coord_type ; - - typedef KokkosSparse::CrsMatrix< ScalarType , OrdinalType , DeviceType , MemoryTraits , SizeType > sparse_matrix_type ; - typedef typename sparse_matrix_type::StaticCrsGraphType sparse_graph_type ; - - typedef DeviceType execution_space ; - typedef ScalarType scalar_type ; + Kokkos::Example::BoxElemFixture, + KokkosSparse::CrsMatrix > { + public: + typedef Kokkos::Example::BoxElemFixture + mesh_type; + typedef typename mesh_type::node_coord_type node_coord_type; + typedef typename node_coord_type::value_type scalar_coord_type; + + typedef KokkosSparse::CrsMatrix + sparse_matrix_type; + typedef typename sparse_matrix_type::StaticCrsGraphType sparse_graph_type; + + typedef DeviceType execution_space; + typedef ScalarType scalar_type; //------------------------------------ - typedef Kokkos::View< scalar_type* , execution_space > vector_type ; + typedef Kokkos::View vector_type; //------------------------------------ // Computational data: - const node_coord_type node_coords ; - const vector_type solution ; - const sparse_matrix_type jacobian ; - const vector_type residual ; - const scalar_type bc_lower_value ; - const scalar_type bc_upper_value ; - const scalar_coord_type bc_lower_limit ; - const scalar_coord_type bc_upper_limit ; - const unsigned bc_plane ; - const unsigned node_count ; - bool init ; - - - DirichletComputation( const mesh_type & arg_mesh , - const vector_type & arg_solution , - const sparse_matrix_type & arg_jacobian , - const vector_type & arg_residual , - const unsigned arg_bc_plane , - const scalar_type arg_bc_lower_value , - const scalar_type arg_bc_upper_value ) - : node_coords( arg_mesh.node_coord() ) - , solution( arg_solution ) - , jacobian( arg_jacobian ) - , residual( arg_residual ) - , bc_lower_value( arg_bc_lower_value ) - , bc_upper_value( arg_bc_upper_value ) - , bc_lower_limit( std::numeric_limits::epsilon() ) - , bc_upper_limit( scalar_coord_type(1) - std::numeric_limits::epsilon() ) - , bc_plane( arg_bc_plane ) - , node_count( arg_mesh.node_count_owned() ) - , init( false ) - { - parallel_for( node_count , *this ); - init = true ; - } - - void apply() const - { - parallel_for( node_count , *this ); + const node_coord_type node_coords; + const vector_type solution; + const sparse_matrix_type jacobian; + const vector_type residual; + const scalar_type bc_lower_value; + const scalar_type bc_upper_value; + const scalar_coord_type bc_lower_limit; + const scalar_coord_type bc_upper_limit; + const unsigned bc_plane; + const unsigned node_count; + bool init; + + DirichletComputation(const mesh_type& arg_mesh, + const vector_type& arg_solution, + const sparse_matrix_type& arg_jacobian, + const vector_type& arg_residual, + const unsigned arg_bc_plane, + const scalar_type arg_bc_lower_value, + const scalar_type arg_bc_upper_value) + : node_coords(arg_mesh.node_coord()), + solution(arg_solution), + jacobian(arg_jacobian), + residual(arg_residual), + bc_lower_value(arg_bc_lower_value), + bc_upper_value(arg_bc_upper_value), + bc_lower_limit(std::numeric_limits::epsilon()), + bc_upper_limit(scalar_coord_type(1) - + std::numeric_limits::epsilon()), + bc_plane(arg_bc_plane), + node_count(arg_mesh.node_count_owned()), + init(false) { + parallel_for(node_count, *this); + init = true; } + void apply() const { parallel_for(node_count, *this); } + //------------------------------------ KOKKOS_INLINE_FUNCTION - void operator()( const unsigned inode ) const - { + void operator()(const unsigned inode) const { // Apply dirichlet boundary condition on the Solution and Residual vectors. // To maintain the symmetry of the original global stiffness matrix, // zero out the columns that correspond to boundary conditions, and // update the residual vector accordingly const unsigned iBeg = jacobian.graph.row_map[inode]; - const unsigned iEnd = jacobian.graph.row_map[inode+1]; + const unsigned iEnd = jacobian.graph.row_map[inode + 1]; - const scalar_coord_type c = node_coords(inode,bc_plane); - const bool bc_lower = c <= bc_lower_limit ; - const bool bc_upper = bc_upper_limit <= c ; + const scalar_coord_type c = node_coords(inode, bc_plane); + const bool bc_lower = c <= bc_lower_limit; + const bool bc_upper = bc_upper_limit <= c; - if ( ! init ) { - solution(inode) = bc_lower ? bc_lower_value : ( - bc_upper ? bc_upper_value : 0 ); - } - else { - if ( bc_lower || bc_upper ) { - - residual(inode) = 0 ; + if (!init) { + solution(inode) = + bc_lower ? bc_lower_value : (bc_upper ? bc_upper_value : 0); + } else { + if (bc_lower || bc_upper) { + residual(inode) = 0; // zero each value on the row, and leave a one // on the diagonal - for( unsigned i = iBeg ; i < iEnd ; ++i ) { - jacobian.values(i) = int(inode) == int(jacobian.graph.entries(i)) ? 1 : 0 ; + for (unsigned i = iBeg; i < iEnd; ++i) { + jacobian.values(i) = + int(inode) == int(jacobian.graph.entries(i)) ? 1 : 0; } - } - else { - + } else { // Find any columns that are boundary conditions. // Clear them and adjust the residual vector - for( unsigned i = iBeg ; i < iEnd ; ++i ) { - const unsigned cnode = jacobian.graph.entries(i) ; - const scalar_coord_type cc = node_coords(cnode,bc_plane); + for (unsigned i = iBeg; i < iEnd; ++i) { + const unsigned cnode = jacobian.graph.entries(i); + const scalar_coord_type cc = node_coords(cnode, bc_plane); - if ( ( cc <= bc_lower_limit ) || ( bc_upper_limit <= cc ) ) { - jacobian.values(i) = 0 ; + if ((cc <= bc_lower_limit) || (bc_upper_limit <= cc)) { + jacobian.values(i) = 0; } } } @@ -1139,11 +1116,10 @@ class DirichletComputation< //---------------------------------------------------------------------------- /* A Cuda-specific specialization for the element computation functor. */ -#if defined( __CUDACC__ ) +#if defined(__CUDACC__) // #include #endif //---------------------------------------------------------------------------- #endif /* #ifndef KOKKOS_EXAMPLE_FENLFUNCTORS_HPP */ - From 45e919ca8dc6b057a2e51d6eb58495649893f7ce Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Fri, 8 Apr 2022 18:46:34 -0600 Subject: [PATCH 219/261] Remove join(volatile) overloads where join() taking non-volatile parameters exists --- src/blas/impl/KokkosBlas1_dot_impl.hpp | 5 ----- src/blas/impl/KokkosBlas1_iamax_impl.hpp | 7 ------- src/blas/impl/KokkosBlas1_nrm2_impl.hpp | 5 ----- src/blas/impl/KokkosBlas1_nrm2w_impl.hpp | 5 ----- 4 files changed, 22 deletions(-) diff --git a/src/blas/impl/KokkosBlas1_dot_impl.hpp b/src/blas/impl/KokkosBlas1_dot_impl.hpp index cb8db757f8..b153b3ed72 100644 --- a/src/blas/impl/KokkosBlas1_dot_impl.hpp +++ b/src/blas/impl/KokkosBlas1_dot_impl.hpp @@ -91,11 +91,6 @@ struct DotFunctor { const value_type& source) const { update += source; } - - KOKKOS_INLINE_FUNCTION void join(volatile value_type& update, - const volatile value_type& source) const { - update += source; - } }; } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_iamax_impl.hpp b/src/blas/impl/KokkosBlas1_iamax_impl.hpp index dc30edf7da..8b27b3e5a3 100644 --- a/src/blas/impl/KokkosBlas1_iamax_impl.hpp +++ b/src/blas/impl/KokkosBlas1_iamax_impl.hpp @@ -96,13 +96,6 @@ struct V_Iamax_Functor { update = Kokkos::reduction_identity::max() + 1; } - KOKKOS_INLINE_FUNCTION void join(volatile value_type& update, - const volatile value_type& source) const { - mag_type source_val = IPT::norm(m_x(source - 1)); - mag_type update_val = IPT::norm(m_x(update - 1)); - if (update_val < source_val) update = source; - } - KOKKOS_INLINE_FUNCTION void join(value_type& update, const value_type& source) const { mag_type source_val = IPT::norm(m_x(source - 1)); diff --git a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp index f2b0e826bc..e56a884655 100644 --- a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp @@ -105,11 +105,6 @@ struct V_Nrm2_Functor { update += source; } - KOKKOS_INLINE_FUNCTION void join(volatile value_type& update, - const volatile value_type& source) const { - update += source; - } - KOKKOS_INLINE_FUNCTION void final(value_type& update) const { if (m_take_sqrt) update = diff --git a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp index 3f202ca430..e2c858f0b3 100644 --- a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp @@ -108,11 +108,6 @@ struct V_Nrm2w_Functor { update += source; } - KOKKOS_INLINE_FUNCTION void join(volatile value_type& update, - const volatile value_type& source) const { - update += source; - } - KOKKOS_INLINE_FUNCTION void final(value_type& update) const { if (m_take_sqrt) update = From 2f60e260571b10f664adc4587aad66317db8fddc Mon Sep 17 00:00:00 2001 From: Phil Miller Date: Tue, 17 May 2022 14:27:53 -0700 Subject: [PATCH 220/261] Drop remaining uses of volatile in reducer join method signatures --- example/fenl/TestFixture.hpp | 3 +-- example/fenl/fenl_functors.hpp | 8 ++------ perf_test/graph/KokkosGraph_run_triangle.hpp | 4 +--- src/batched/KokkosBatched_Util.hpp | 4 +--- src/blas/impl/KokkosBlas1_dot_impl.hpp | 2 +- src/blas/impl/KokkosBlas2_gemv_impl.hpp | 3 +-- src/common/KokkosKernels_SimpleUtils.hpp | 2 +- src/common/KokkosKernels_Utils.hpp | 15 ++++----------- src/graph/KokkosGraph_Distance1ColorHandle.hpp | 4 ++-- src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp | 2 +- .../impl/KokkosSparse_spgemm_impl_symbolic.hpp | 6 +++--- unit_test/common/Test_Common_ArithTraits.hpp | 4 ++-- 12 files changed, 20 insertions(+), 37 deletions(-) diff --git a/example/fenl/TestFixture.hpp b/example/fenl/TestFixture.hpp index 7c09752433..54b841c4b6 100644 --- a/example/fenl/TestFixture.hpp +++ b/example/fenl/TestFixture.hpp @@ -74,8 +74,7 @@ struct FixtureVerifyElemNodeCoord { void init(value_type& update) const { update.success = update.error = 0; } KOKKOS_INLINE_FUNCTION - void join(volatile value_type& update, - volatile const value_type& input) const { + void join(value_type& update, const value_type& input) const { update.success += input.success; update.error += input.error; } diff --git a/example/fenl/fenl_functors.hpp b/example/fenl/fenl_functors.hpp index 5706497db2..0a489fa1c0 100644 --- a/example/fenl/fenl_functors.hpp +++ b/example/fenl/fenl_functors.hpp @@ -364,9 +364,7 @@ class NodeNodeGraph { void init(unsigned& update) const { update = 0; } KOKKOS_INLINE_FUNCTION - void join(volatile unsigned& update, const volatile unsigned& input) const { - update += input; - } + void join(unsigned& update, const unsigned& input) const { update += input; } //------------------------------------ }; @@ -642,9 +640,7 @@ class NodeElemGatherFill { void init(unsigned& update) const { update = 0; } KOKKOS_INLINE_FUNCTION - void join(volatile unsigned& update, const volatile unsigned& input) const { - update += input; - } + void join(unsigned& update, const unsigned& input) const { update += input; } }; } /* namespace FENL */ diff --git a/perf_test/graph/KokkosGraph_run_triangle.hpp b/perf_test/graph/KokkosGraph_run_triangle.hpp index 2fee139a64..0a189cd3e1 100644 --- a/perf_test/graph/KokkosGraph_run_triangle.hpp +++ b/perf_test/graph/KokkosGraph_run_triangle.hpp @@ -117,9 +117,7 @@ struct Flush { void init(value_type &update) { update = 0; } KOKKOS_INLINE_FUNCTION - void join(volatile value_type &update, const volatile value_type &input) { - update += input; - } + void join(value_type &update, const value_type &input) { update += input; } KOKKOS_INLINE_FUNCTION void operator()(const int i, value_type &update) const { update += _buf[i]; } diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 338c3fe8f8..46b97ee039 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -123,9 +123,7 @@ struct Flush { void init(value_type &update) { update = 0; } KOKKOS_INLINE_FUNCTION - void join(volatile value_type &update, const volatile value_type &input) { - update += input; - } + void join(value_type &update, const value_type &input) { update += input; } KOKKOS_INLINE_FUNCTION void operator()(const int i, value_type &update) const { update += _buf[i]; } diff --git a/src/blas/impl/KokkosBlas1_dot_impl.hpp b/src/blas/impl/KokkosBlas1_dot_impl.hpp index b153b3ed72..5430e0177b 100644 --- a/src/blas/impl/KokkosBlas1_dot_impl.hpp +++ b/src/blas/impl/KokkosBlas1_dot_impl.hpp @@ -83,7 +83,7 @@ struct DotFunctor { Kokkos::Details::updateDot(sum, m_x(i), m_y(i)); // sum += m_x(i) * m_y(i) } - KOKKOS_INLINE_FUNCTION void init(volatile value_type& update) const { + KOKKOS_INLINE_FUNCTION void init(value_type& update) const { update = Kokkos::Details::ArithTraits::zero(); } diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp index a16a9eaf9a..a6c8111684 100644 --- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -190,8 +190,7 @@ struct SingleLevelTransposeGEMV { } } - KOKKOS_INLINE_FUNCTION void join(volatile value_type dst, - const volatile value_type src) const { + KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const { for (IndexType j = 0; j < value_count; ++j) { dst[j] += src[j]; } diff --git a/src/common/KokkosKernels_SimpleUtils.hpp b/src/common/KokkosKernels_SimpleUtils.hpp index c1f68ebd3b..bb2a6d43b9 100644 --- a/src/common/KokkosKernels_SimpleUtils.hpp +++ b/src/common/KokkosKernels_SimpleUtils.hpp @@ -346,7 +346,7 @@ struct ReduceMaxFunctor { } } KOKKOS_INLINE_FUNCTION - void join(volatile value_type &dst, const volatile value_type &src) const { + void join(value_type &dst, const value_type &src) const { if (dst < src) { dst = src; } diff --git a/src/common/KokkosKernels_Utils.hpp b/src/common/KokkosKernels_Utils.hpp index bf881edc6f..eae4080879 100644 --- a/src/common/KokkosKernels_Utils.hpp +++ b/src/common/KokkosKernels_Utils.hpp @@ -515,7 +515,7 @@ struct PropogataMaxValstoZeros { } KOKKOS_INLINE_FUNCTION - void join(volatile idx &update, volatile const idx &input) const { + void join(idx &update, const idx &input) const { if (input > update) update = input; } }; @@ -1260,7 +1260,7 @@ struct ReduceRowSizeFunctor { } } KOKKOS_INLINE_FUNCTION - void join(volatile size_type &dst, const volatile size_type &src) const { + void join(size_type &dst, const size_type &src) const { if (dst < src) { dst = src; } @@ -1305,7 +1305,7 @@ struct ReduceMaxRowFunctor { } } KOKKOS_INLINE_FUNCTION - void join(volatile value_type &dst, const volatile value_type &src) const { + void join(value_type &dst, const value_type &src) const { if (dst < src) { dst = src; } @@ -1350,9 +1350,7 @@ struct IsEqualFunctor { } KOKKOS_INLINE_FUNCTION - void join(volatile int &dst, const volatile int &src) const { - dst = dst & src; - } + void join(int &dst, const int &src) const { dst = dst & src; } KOKKOS_INLINE_FUNCTION void init(int &dst) const { dst = 1; } }; @@ -1466,11 +1464,6 @@ struct array_sum_reduce { for (int i = 0; i < N; i++) data[i] += src.data[i]; return *this; } - KOKKOS_INLINE_FUNCTION // volatile add operator - void - operator+=(const volatile ValueType &src) volatile { - for (int i = 0; i < N; i++) data[i] += src.data[i]; - } }; template diff --git a/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/src/graph/KokkosGraph_Distance1ColorHandle.hpp index 7f04bfa94f..0f5d60591f 100644 --- a/src/graph/KokkosGraph_Distance1ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance1ColorHandle.hpp @@ -560,9 +560,9 @@ class GraphColoringHandle { if (color_max < colors(i)) color_max = colors(i); } + // max-plus semiring equivalent of "plus" KOKKOS_INLINE_FUNCTION - void join(volatile color_t &dst, const volatile color_t &src) - const { // max -plus semiring equivalent of "plus" + void join(color_t &dst, const color_t &src) const { if (dst < src) { dst = src; } diff --git a/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp b/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp index e566e8bf06..c6a24e2163 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp @@ -509,7 +509,7 @@ struct KokkosSPGEMM Date: Tue, 28 Jun 2022 23:29:13 +0200 Subject: [PATCH 221/261] Move {Serial,Team}{Set,Scale} unit tests from KokkosBatched to KokkosBlas --- .../batched/dense/Test_Batched_Dense.hpp | 6 - .../dense/Test_Batched_SerialMatUtil.hpp | 165 ----------- .../Test_Batched_SerialMatUtil_Complex.hpp | 19 -- .../dense/Test_Batched_SerialMatUtil_Real.hpp | 18 -- .../dense/Test_Batched_TeamMatUtil.hpp | 178 ------------ .../Test_Batched_TeamMatUtil_Complex.hpp | 19 -- .../dense/Test_Batched_TeamMatUtil_Real.hpp | 21 -- unit_test/blas/Test_Blas.hpp | 4 + unit_test/blas/Test_Blas1_serial_setscal.hpp | 246 +++++++++++++++++ unit_test/blas/Test_Blas1_team_setscal.hpp | 259 ++++++++++++++++++ 10 files changed, 509 insertions(+), 426 deletions(-) delete mode 100644 unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp delete mode 100644 unit_test/batched/dense/Test_Batched_SerialMatUtil_Complex.hpp delete mode 100644 unit_test/batched/dense/Test_Batched_SerialMatUtil_Real.hpp delete mode 100644 unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp delete mode 100644 unit_test/batched/dense/Test_Batched_TeamMatUtil_Complex.hpp delete mode 100644 unit_test/batched/dense/Test_Batched_TeamMatUtil_Real.hpp create mode 100644 unit_test/blas/Test_Blas1_serial_setscal.hpp create mode 100644 unit_test/blas/Test_Blas1_team_setscal.hpp diff --git a/unit_test/batched/dense/Test_Batched_Dense.hpp b/unit_test/batched/dense/Test_Batched_Dense.hpp index 57de7ebfdd..edf573c633 100644 --- a/unit_test/batched/dense/Test_Batched_Dense.hpp +++ b/unit_test/batched/dense/Test_Batched_Dense.hpp @@ -24,9 +24,6 @@ #include "Test_Batched_SerialLU.hpp" #include "Test_Batched_SerialLU_Real.hpp" #include "Test_Batched_SerialLU_Complex.hpp" -#include "Test_Batched_SerialMatUtil.hpp" -#include "Test_Batched_SerialMatUtil_Real.hpp" -#include "Test_Batched_SerialMatUtil_Complex.hpp" #include "Test_Batched_SerialSolveLU.hpp" #include "Test_Batched_SerialSolveLU_Real.hpp" #include "Test_Batched_SerialSolveLU_Complex.hpp" @@ -62,9 +59,6 @@ #include "Test_Batched_TeamLU.hpp" #include "Test_Batched_TeamLU_Real.hpp" #include "Test_Batched_TeamLU_Complex.hpp" -#include "Test_Batched_TeamMatUtil.hpp" -#include "Test_Batched_TeamMatUtil_Real.hpp" -#include "Test_Batched_TeamMatUtil_Complex.hpp" #include "Test_Batched_TeamSolveLU.hpp" #include "Test_Batched_TeamSolveLU_Real.hpp" #include "Test_Batched_TeamSolveLU_Complex.hpp" diff --git a/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp b/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp deleted file mode 100644 index 56939beb87..0000000000 --- a/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp +++ /dev/null @@ -1,165 +0,0 @@ -/// \author Kyungjoo Kim (kyukim@sandia.gov) - -#include "gtest/gtest.h" -#include "Kokkos_Core.hpp" -#include "Kokkos_Random.hpp" - -#include "KokkosBlas1_set.hpp" - -// TODO: move this test to KokkosBlas when both SerialScale and SerialSet are -// moved -#include "KokkosBlas1_scal.hpp" // #include "KokkosBatched_Scale_Decl.hpp" - -#include "KokkosKernels_TestUtils.hpp" - -using namespace KokkosBatched; - -namespace Test { - -enum : int { BatchedSet = 0, BatchedScale = 1 }; - -struct KokkosKernelTag {}; -struct NaiveTag {}; - -template -struct Functor_TestBatchedSerialMatUtil { - ScalarType _alpha; - ViewType _a; - - KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialMatUtil(const ScalarType alpha, const ViewType &a) - : _alpha(alpha), _a(a) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const KokkosKernelTag &, const int i) const { - auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL()); - switch (TestID) { - case BatchedSet: KokkosBlas::SerialSet::invoke(_alpha, A); break; - case BatchedScale: KokkosBlas::SerialScale::invoke(_alpha, A); break; - } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const NaiveTag &, const int k) const { - // MD Note: changing because of the error with -werror - auto A = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); - const int m = A.extent(0), n = A.extent(1); - switch (TestID) { - case BatchedSet: { - for (int i = 0; i < m; ++i) - for (int j = 0; j < n; ++j) A(i, j) = _alpha; - break; - } - case BatchedScale: { - for (int i = 0; i < m; ++i) - for (int j = 0; j < n; ++j) A(i, j) *= _alpha; - break; - } - } - } - - inline int run() { - typedef typename ViewType::value_type value_type; - std::string name_region("KokkosBatched::Test::SerialMatUtil"); - const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBatched" - : std::is_same::value ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = - (TestID == BatchedSet - ? "Set" - : TestID == BatchedScale ? "Scale" : "UnknownTest"); - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; - Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); - Kokkos::parallel_for(name.c_str(), policy, *this); - Kokkos::Profiling::popRegion(); - return 0; - } -}; - -template -void impl_test_batched_matutil(const int N, const int BlkSize) { - /// typedefs - typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; - - /// radomized input testing views - const ScalarType alpha = 11.1; - ViewType a("a", N, BlkSize, BlkSize); - ViewType b("b", N, BlkSize, BlkSize); - - Kokkos::Random_XorShift64_Pool random( - 13718); - Kokkos::fill_random(a, random, value_type(1.0)); - - Kokkos::fence(); - - Kokkos::deep_copy(b, a); - - /// test body - Functor_TestBatchedSerialMatUtil(alpha, a) - .run(); - Functor_TestBatchedSerialMatUtil(alpha, b) - .run(); - - Kokkos::fence(); - - /// for comparison send it to host - typename ViewType::HostMirror a_host = Kokkos::create_mirror_view(a); - typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b); - - Kokkos::deep_copy(a_host, a); - Kokkos::deep_copy(b_host, b); - - /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); - for (int k = 0; k < N; ++k) - for (int i = 0; i < BlkSize; ++i) - for (int j = 0; j < BlkSize; ++j) - EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); -} -} // namespace Test - -template -int test_batched_matutil() { -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) - { - typedef Kokkos::View - ViewType; - Test::impl_test_batched_matutil( - 0, 10); - Test::impl_test_batched_matutil( - 10, 15); - Test::impl_test_batched_matutil( - 1024, 9); - Test::impl_test_batched_matutil( - 132231, 3); - } -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) - { - typedef Kokkos::View - ViewType; - Test::impl_test_batched_matutil( - 0, 10); - Test::impl_test_batched_matutil( - 10, 15); - Test::impl_test_batched_matutil( - 1024, 9); - Test::impl_test_batched_matutil( - 132231, 3); - } -#endif - - return 0; -} diff --git a/unit_test/batched/dense/Test_Batched_SerialMatUtil_Complex.hpp b/unit_test/batched/dense/Test_Batched_SerialMatUtil_Complex.hpp deleted file mode 100644 index 055a0cae62..0000000000 --- a/unit_test/batched/dense/Test_Batched_SerialMatUtil_Complex.hpp +++ /dev/null @@ -1,19 +0,0 @@ - -#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) -TEST_F(TestCategory, batched_scalar_serial_set_dcomplex_dcomplex) { - test_batched_matutil, - Kokkos::complex, ::Test::BatchedSet>(); -} -TEST_F(TestCategory, batched_scalar_serial_scale_dcomplex_dcomplex) { - test_batched_matutil, - Kokkos::complex, ::Test::BatchedScale>(); -} -TEST_F(TestCategory, batched_scalar_serial_set_dcomplex_double) { - test_batched_matutil, double, - ::Test::BatchedSet>(); -} -TEST_F(TestCategory, batched_scalar_serial_scale_dcomplex_double) { - test_batched_matutil, double, - ::Test::BatchedScale>(); -} -#endif diff --git a/unit_test/batched/dense/Test_Batched_SerialMatUtil_Real.hpp b/unit_test/batched/dense/Test_Batched_SerialMatUtil_Real.hpp deleted file mode 100644 index c1644f9798..0000000000 --- a/unit_test/batched/dense/Test_Batched_SerialMatUtil_Real.hpp +++ /dev/null @@ -1,18 +0,0 @@ - -#if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_serial_set_float_float) { - test_batched_matutil(); -} -TEST_F(TestCategory, batched_scalar_serial_scale_float_float) { - test_batched_matutil(); -} -#endif - -#if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_serial_set_double_double) { - test_batched_matutil(); -} -TEST_F(TestCategory, batched_scalar_serial_scale_double_double) { - test_batched_matutil(); -} -#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp b/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp deleted file mode 100644 index 8a3c9939bf..0000000000 --- a/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp +++ /dev/null @@ -1,178 +0,0 @@ -/// \author Kyungjoo Kim (kyukim@sandia.gov) - -#include "gtest/gtest.h" -#include "Kokkos_Core.hpp" -#include "Kokkos_Random.hpp" - -#include "KokkosBlas1_set.hpp" - -// #include "KokkosBatched_Scale_Decl.hpp" - -#include "KokkosKernels_TestUtils.hpp" - -using namespace KokkosBatched; - -namespace Test { -namespace TeamMatUtil { - -enum : int { BatchedSet = 0, BatchedScale = 1 }; - -struct KokkosKernelTag {}; -struct NaiveTag {}; - -template -struct Functor_TestBatchedTeamMatUtil { - ScalarType _alpha; - ViewType _a; - - KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamMatUtil(const ScalarType alpha, const ViewType &a) - : _alpha(alpha), _a(a) {} - - template - KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, - const MemberType &member) const { - const int i = member.league_rank(); - auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL()); - switch (TestID) { - case BatchedSet: - KokkosBlas::TeamSet::invoke(member, _alpha, A); - break; - case BatchedScale: - KokkosBlas::TeamScale::invoke(member, _alpha, A); - break; - } - } - - template - KOKKOS_INLINE_FUNCTION void operator()(const NaiveTag &, - const MemberType &member) const { - if (member.team_rank() == 0) { - const int k = member.league_rank(); - auto A = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); - const int m = A.extent(0), n = A.extent(1); - switch (TestID) { - case BatchedSet: { - for (int i = 0; i < m; ++i) - for (int j = 0; j < n; ++j) A(i, j) = _alpha; - break; - } - case BatchedScale: { - for (int i = 0; i < m; ++i) - for (int j = 0; j < n; ++j) A(i, j) *= _alpha; - break; - } - } - } - } - - inline int run() { - typedef typename ViewType::value_type value_type; - std::string name_region("KokkosBatched::Test::SerialMatUtil"); - const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBatched" - : std::is_same::value ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = - (TestID == BatchedSet - ? "Set" - : TestID == BatchedScale ? "Scale" : "UnknownTest"); - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; - Kokkos::Profiling::pushRegion(name.c_str()); - - const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); - Kokkos::parallel_for(name.c_str(), policy, *this); - Kokkos::Profiling::popRegion(); - - return 0; - } -}; - -template -void impl_test_batched_matutil(const int N, const int BlkSize) { - /// typedefs - typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; - - /// radomized input testing views - const ScalarType alpha = 11.1; - ViewType a("a", N, BlkSize, BlkSize); - ViewType b("b", N, BlkSize, BlkSize); - - Kokkos::Random_XorShift64_Pool random( - 13718); - Kokkos::fill_random(a, random, value_type(1.0)); - - Kokkos::fence(); - - Kokkos::deep_copy(b, a); - - /// test body - Functor_TestBatchedTeamMatUtil(alpha, a) - .run(); - Functor_TestBatchedTeamMatUtil(alpha, b) - .run(); - - Kokkos::fence(); - - /// for comparison send it to host - typename ViewType::HostMirror a_host = Kokkos::create_mirror_view(a); - typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b); - - Kokkos::deep_copy(a_host, a); - Kokkos::deep_copy(b_host, b); - - /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); - for (int k = 0; k < N; ++k) - for (int i = 0; i < BlkSize; ++i) - for (int j = 0; j < BlkSize; ++j) - EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); -} -} // namespace TeamMatUtil -} // namespace Test - -template -int test_batched_team_matutil() { -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) - { - typedef Kokkos::View - ViewType; - Test::TeamMatUtil::impl_test_batched_matutil(0, 10); - Test::TeamMatUtil::impl_test_batched_matutil(10, 15); - Test::TeamMatUtil::impl_test_batched_matutil(1024, 9); - Test::TeamMatUtil::impl_test_batched_matutil(132231, 3); - } -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) - { - typedef Kokkos::View - ViewType; - Test::TeamMatUtil::impl_test_batched_matutil(0, 10); - Test::TeamMatUtil::impl_test_batched_matutil(10, 15); - Test::TeamMatUtil::impl_test_batched_matutil(1024, 9); - Test::TeamMatUtil::impl_test_batched_matutil(132231, 3); - } -#endif - - return 0; -} diff --git a/unit_test/batched/dense/Test_Batched_TeamMatUtil_Complex.hpp b/unit_test/batched/dense/Test_Batched_TeamMatUtil_Complex.hpp deleted file mode 100644 index 7f573354d8..0000000000 --- a/unit_test/batched/dense/Test_Batched_TeamMatUtil_Complex.hpp +++ /dev/null @@ -1,19 +0,0 @@ - -#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) -TEST_F(TestCategory, batched_scalar_team_set_dcomplex_dcomplex) { - test_batched_team_matutil, - Kokkos::complex, ::Test::BatchedSet>(); -} -TEST_F(TestCategory, batched_scalar_team_scale_dcomplex_dcomplex) { - test_batched_team_matutil, - Kokkos::complex, ::Test::BatchedScale>(); -} -TEST_F(TestCategory, batched_scalar_team_set_dcomplex_double) { - test_batched_team_matutil, double, - ::Test::BatchedSet>(); -} -TEST_F(TestCategory, batched_scalar_team_scale_dcomplex_double) { - test_batched_team_matutil, double, - ::Test::BatchedScale>(); -} -#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamMatUtil_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamMatUtil_Real.hpp deleted file mode 100644 index 1f13b79cca..0000000000 --- a/unit_test/batched/dense/Test_Batched_TeamMatUtil_Real.hpp +++ /dev/null @@ -1,21 +0,0 @@ - -#if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_team_set_float_float) { - test_batched_team_matutil(); -} -TEST_F(TestCategory, batched_scalar_team_scale_float_float) { - test_batched_team_matutil(); -} -#endif - -#if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_team_set_double_double) { - test_batched_team_matutil(); -} -TEST_F(TestCategory, batched_scalar_team_scale_double_double) { - test_batched_team_matutil(); -} -#endif diff --git a/unit_test/blas/Test_Blas.hpp b/unit_test/blas/Test_Blas.hpp index 16d54e3dce..42b1050c40 100644 --- a/unit_test/blas/Test_Blas.hpp +++ b/unit_test/blas/Test_Blas.hpp @@ -23,7 +23,11 @@ #include "Test_Blas1_sum.hpp" #include "Test_Blas1_update.hpp" +// Serial Blas 1 +#include "Test_Blas1_serial_setscal.hpp" + // Team Blas 1 +#include "Test_Blas1_team_setscal.hpp" #include "Test_Blas1_team_abs.hpp" #include "Test_Blas1_team_axpby.hpp" #include "Test_Blas1_team_axpy.hpp" diff --git a/unit_test/blas/Test_Blas1_serial_setscal.hpp b/unit_test/blas/Test_Blas1_serial_setscal.hpp new file mode 100644 index 0000000000..2e2a207c47 --- /dev/null +++ b/unit_test/blas/Test_Blas1_serial_setscal.hpp @@ -0,0 +1,246 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBlas1_set.hpp" +#include "KokkosBlas1_scal.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +using namespace KokkosBlas; + +namespace Test { + +enum : int { BlasSet = 0, BlasScale = 1 }; + +struct KokkosKernelTag {}; +struct NaiveTag {}; + +template +struct Functor_TestBlasSerialMatUtil { + ScalarType _alpha; + ViewType _a; + + KOKKOS_INLINE_FUNCTION + Functor_TestBlasSerialMatUtil(const ScalarType alpha, const ViewType &a) + : _alpha(alpha), _a(a) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const KokkosKernelTag &, const int i) const { + auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL()); + switch (TestID) { + case BlasSet: KokkosBlas::SerialSet::invoke(_alpha, A); break; + case BlasScale: KokkosBlas::SerialScale::invoke(_alpha, A); break; + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const NaiveTag &, const int k) const { + // MD Note: changing because of the error with -werror + auto A = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + const int m = A.extent(0), n = A.extent(1); + switch (TestID) { + case BlasSet: { + for (int i = 0; i < m; ++i) + for (int j = 0; j < n; ++j) A(i, j) = _alpha; + break; + } + case BlasScale: { + for (int i = 0; i < m; ++i) + for (int j = 0; j < n; ++j) A(i, j) *= _alpha; + break; + } + } + } + + inline int run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBlas::Test::SerialMatUtil"); + const std::string name_value_type = Test::value_type_name(); + std::string name_work_tag = + (std::is_same::value + ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = + (TestID == BlasSet ? "Set" + : TestID == BlasScale ? "Scale" : "UnknownTest"); + std::string name = + name_region + name_value_type + name_work_tag + name_test_id; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + return 0; + } +}; + +template +void impl_test_blas_matutil(const int N, const int BlkSize) { + /// typedefs + typedef typename ViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + /// radomized input testing views + const ScalarType alpha = 11.1; + ViewType a("a", N, BlkSize, BlkSize); + ViewType b("b", N, BlkSize, BlkSize); + + Kokkos::Random_XorShift64_Pool random( + 13718); + Kokkos::fill_random(a, random, value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(b, a); + + /// test body + Functor_TestBlasSerialMatUtil(alpha, a) + .run(); + Functor_TestBlasSerialMatUtil(alpha, b) + .run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename ViewType::HostMirror a_host = Kokkos::create_mirror_view(a); + typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b); + + Kokkos::deep_copy(a_host, a); + Kokkos::deep_copy(b_host, b); + + /// check a = b + typename ats::mag_type eps = + 100 * std::numeric_limits::epsilon(); + for (int k = 0; k < N; ++k) + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) + EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); +} +} // namespace Test + +template +int test_blas_matutil() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View + ViewType; + Test::impl_test_blas_matutil(0, + 10); + Test::impl_test_blas_matutil(10, + 15); + Test::impl_test_blas_matutil(1024, + 9); + Test::impl_test_blas_matutil( + 132231, 3); + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View + ViewType; + Test::impl_test_blas_matutil(0, + 10); + Test::impl_test_blas_matutil(10, + 15); + Test::impl_test_blas_matutil(1024, + 9); + Test::impl_test_blas_matutil( + 132231, 3); + } +#endif + + return 0; +} + +// Real test cases + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, blas_scalar_serial_set_float_float) { + test_blas_matutil(); +} +TEST_F(TestCategory, blas_scalar_serial_scale_float_float) { + test_blas_matutil(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, blas_scalar_serial_set_double_double) { + test_blas_matutil(); +} +TEST_F(TestCategory, blas_scalar_serial_scale_double_double) { + test_blas_matutil(); +} +#endif + +// Complex test cases + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_dcomplex) { + test_blas_matutil, + Kokkos::complex, ::Test::BlasSet>(); +} +TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_dcomplex) { + test_blas_matutil, + Kokkos::complex, ::Test::BlasScale>(); +} +TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_double) { + test_blas_matutil, double, + ::Test::BlasSet>(); +} +TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_double) { + test_blas_matutil, double, + ::Test::BlasScale>(); +} +#endif diff --git a/unit_test/blas/Test_Blas1_team_setscal.hpp b/unit_test/blas/Test_Blas1_team_setscal.hpp new file mode 100644 index 0000000000..394c7b6c2d --- /dev/null +++ b/unit_test/blas/Test_Blas1_team_setscal.hpp @@ -0,0 +1,259 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBlas1_set.hpp" +#include "KokkosBlas1_scal.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +namespace Test { +namespace TeamMatUtil { + +enum : int { BlasSet = 0, BlasScale = 1 }; + +struct KokkosKernelTag {}; +struct NaiveTag {}; + +template +struct Functor_TestBlasTeamMatUtil { + ScalarType _alpha; + ViewType _a; + + KOKKOS_INLINE_FUNCTION + Functor_TestBlasTeamMatUtil(const ScalarType alpha, const ViewType &a) + : _alpha(alpha), _a(a) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, + const MemberType &member) const { + const int i = member.league_rank(); + auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL()); + switch (TestID) { + case BlasSet: + KokkosBlas::TeamSet::invoke(member, _alpha, A); + break; + case BlasScale: + KokkosBlas::TeamScale::invoke(member, _alpha, A); + break; + } + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const NaiveTag &, + const MemberType &member) const { + if (member.team_rank() == 0) { + const int k = member.league_rank(); + auto A = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + const int m = A.extent(0), n = A.extent(1); + switch (TestID) { + case BlasSet: { + for (int i = 0; i < m; ++i) + for (int j = 0; j < n; ++j) A(i, j) = _alpha; + break; + } + case BlasScale: { + for (int i = 0; i < m; ++i) + for (int j = 0; j < n; ++j) A(i, j) *= _alpha; + break; + } + } + } + } + + inline int run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBlas::Test::SerialMatUtil"); + const std::string name_value_type = Test::value_type_name(); + std::string name_work_tag = + (std::is_same::value + ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = + (TestID == BlasSet ? "Set" + : TestID == BlasScale ? "Scale" : "UnknownTest"); + std::string name = + name_region + name_value_type + name_work_tag + name_test_id; + Kokkos::Profiling::pushRegion(name.c_str()); + + const int league_size = _a.extent(0); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + + return 0; + } +}; + +template +void impl_test_blas_matutil(const int N, const int BlkSize) { + /// typedefs + typedef typename ViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + /// radomized input testing views + const ScalarType alpha = 11.1; + ViewType a("a", N, BlkSize, BlkSize); + ViewType b("b", N, BlkSize, BlkSize); + + Kokkos::Random_XorShift64_Pool random( + 13718); + Kokkos::fill_random(a, random, value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(b, a); + + /// test body + Functor_TestBlasTeamMatUtil(alpha, a) + .run(); + Functor_TestBlasTeamMatUtil(alpha, b) + .run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename ViewType::HostMirror a_host = Kokkos::create_mirror_view(a); + typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b); + + Kokkos::deep_copy(a_host, a); + Kokkos::deep_copy(b_host, b); + + /// check a = b + typename ats::mag_type eps = + 100 * std::numeric_limits::epsilon(); + for (int k = 0; k < N; ++k) + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) + EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); +} +} // namespace TeamMatUtil +} // namespace Test + +template +int test_blas_team_matutil() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View + ViewType; + Test::TeamMatUtil::impl_test_blas_matutil(0, 10); + Test::TeamMatUtil::impl_test_blas_matutil(10, 15); + Test::TeamMatUtil::impl_test_blas_matutil(1024, 9); + Test::TeamMatUtil::impl_test_blas_matutil(132231, 3); + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View + ViewType; + Test::TeamMatUtil::impl_test_blas_matutil(0, 10); + Test::TeamMatUtil::impl_test_blas_matutil(10, 15); + Test::TeamMatUtil::impl_test_blas_matutil(1024, 9); + Test::TeamMatUtil::impl_test_blas_matutil(132231, 3); + } +#endif + + return 0; +} + +// Real test cases + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, blas_scalar_team_set_float_float) { + test_blas_team_matutil(); +} +TEST_F(TestCategory, blas_scalar_team_scale_float_float) { + test_blas_team_matutil(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, blas_scalar_team_set_double_double) { + test_blas_team_matutil(); +} +TEST_F(TestCategory, blas_scalar_team_scale_double_double) { + test_blas_team_matutil(); +} +#endif + +// Complex test cases + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, blas_scalar_team_set_dcomplex_dcomplex) { + test_blas_team_matutil, + Kokkos::complex, ::Test::BlasSet>(); +} +TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_dcomplex) { + test_blas_team_matutil, + Kokkos::complex, ::Test::BlasScale>(); +} +TEST_F(TestCategory, blas_scalar_team_set_dcomplex_double) { + test_blas_team_matutil, double, + ::Test::BlasSet>(); +} +TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_double) { + test_blas_team_matutil, double, + ::Test::BlasScale>(); +} +#endif From e397f2b3b6ec98c3ff85036d4392049f2212440f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Thu, 16 Jun 2022 00:14:52 +0200 Subject: [PATCH 222/261] Fix incorrect function tested --- .../dense/Test_Batched_TeamGemv_Complex.hpp | 22 ++++++++++--------- .../dense/Test_Batched_TeamGemv_Real.hpp | 16 +++++++------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp b/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp index cdcd00cff2..3ffc34db23 100644 --- a/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp @@ -5,19 +5,21 @@ TEST_F(TestCategory, batched_scalar_team_gemv_nt_dcomplex_dcomplex) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_team_gemv, + Kokkos::complex, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemv_t_dcomplex_dcomplex) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_team_gemv, + Kokkos::complex, param_tag_type, + algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemv_ct_dcomplex_dcomplex ) { // typedef ::Test::TeamGemv::ParamTag param_tag_type; // typedef Algo::Gemv::Blocked algo_tag_type; -// test_batched_gemv,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_team_gemv,Kokkos::complex,param_tag_type,algo_tag_type>(); // } /// dcomplex, double @@ -25,19 +27,19 @@ TEST_F(TestCategory, batched_scalar_team_gemv_t_dcomplex_dcomplex) { TEST_F(TestCategory, batched_scalar_team_gemv_nt_dcomplex_double) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv, double, - param_tag_type, algo_tag_type>(); + test_batched_team_gemv, double, + param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemv_t_dcomplex_double) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv, double, - param_tag_type, algo_tag_type>(); + test_batched_team_gemv, double, + param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemv_ct_dcomplex_double ) { // typedef ::Test::TeamGemv::ParamTag param_tag_type; // typedef Algo::Gemv::Blocked algo_tag_type; -// test_batched_gemv,double,param_tag_type,algo_tag_type>(); +// test_batched_team_gemv,double,param_tag_type,algo_tag_type>(); // } #endif diff --git a/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp index 8401075f47..2c4db11b2d 100644 --- a/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp @@ -3,14 +3,14 @@ TEST_F(TestCategory, batched_scalar_team_gemv_nt_float_float) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv(); + test_batched_team_gemv(); } TEST_F(TestCategory, batched_scalar_team_gemv_t_float_float) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv(); + test_batched_team_gemv(); } #endif @@ -18,13 +18,13 @@ TEST_F(TestCategory, batched_scalar_team_gemv_t_float_float) { TEST_F(TestCategory, batched_scalar_team_gemv_nt_double_double) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv(); + test_batched_team_gemv(); } TEST_F(TestCategory, batched_scalar_team_gemv_t_double_double) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv(); + test_batched_team_gemv(); } #endif From 9f3c4bc8b36f0bf9b7f23dde55f37318f5de11b1 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 14 Jul 2022 09:09:42 -0600 Subject: [PATCH 223/261] KokkosSparse: applying clang format to Utils --- .../sparse/KokkosSparse_sptrsv_supernode.cpp | 28 +++++++------- .../sparse/Test_Sparse_block_gauss_seidel.hpp | 37 +++++++++---------- 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp index b7eb39d68e..b77f0b1d07 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp @@ -208,24 +208,24 @@ int test_sptrsv_perf(std::vector tests, bool verbose, if (test == SUPERNODAL_NAIVE) { std::cout << " > create handle for SUPERNODAL_NAIVE" << std::endl << std::endl; - khL.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_NAIVE, nrows, - true); - khU.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_NAIVE, nrows, - true); + khL.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_NAIVE, + nrows, true); + khU.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_NAIVE, + nrows, true); } else if (test == SUPERNODAL_DAG) { std::cout << " > create handle for SUPERNODAL_DAG" << std::endl << std::endl; - khL.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, - true); - khU.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, - true); + khL.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_DAG, + nrows, true); + khU.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_DAG, + nrows, true); } else if (test == SUPERNODAL_SPMV_DAG) { std::cout << " > create handle for SUPERNODAL_SPMV_DAG" << std::endl << std::endl; - khL.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, - nrows, true); - khU.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, - nrows, true); + khL.create_sptrsv_handle( + KSExp::SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, nrows, true); + khU.create_sptrsv_handle( + KSExp::SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, nrows, true); } // verbose (optional, default is false) khL.set_sptrsv_verbose(verbose); @@ -250,8 +250,8 @@ int test_sptrsv_perf(std::vector tests, bool verbose, // graph/dag) khU.get_sptrsv_handle()->set_column_major( !khL.get_sptrsv_handle()->is_column_major()); - KSExp::sptrsv_supernodal_symbolic(nsuper, supercols.data(), etree, L.graph, - &khL, L.graph, &khU); + KSExp::sptrsv_supernodal_symbolic(nsuper, supercols.data(), etree, + L.graph, &khL, L.graph, &khU); // ============================================== // do numeric compute (copy numerical values from SuperLU data diff --git a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp index b0c57ccf7e..51e0899529 100644 --- a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp @@ -82,8 +82,9 @@ struct GSTestParams { // Note: GS_DEFAULT is same as GS_TEAM and - for blocks - as GS_PERMUTED // Note: GS_TWOSTAGE and GS_CLUSTER are not supported for blocks - std::vector gs_algorithms = {KokkosSparse::GS_DEFAULT}; - std::vector shmem_sizes = { + std::vector gs_algorithms = { + KokkosSparse::GS_DEFAULT}; + std::vector shmem_sizes = { 32128, 2008 // make the shmem small on gpus so that it will test 2 level // algorithm. @@ -127,9 +128,9 @@ int run_block_gauss_seidel_1( const int apply_count = 100; if (!skip_symbolic) { - KSExp::block_gauss_seidel_symbolic(&kh, num_rows_1, num_cols_1, block_size, - input_mat.graph.row_map, - input_mat.graph.entries, is_symmetric_graph); + KSExp::block_gauss_seidel_symbolic( + &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map, + input_mat.graph.entries, is_symmetric_graph); } if (!skip_numeric) { @@ -172,8 +173,8 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using namespace Test; srand(245); - using crsMat_t = - typename KokkosSparse::CrsMatrix; + using crsMat_t = typename KokkosSparse::CrsMatrix; using MatrixConverter = KokkosSparse::Impl::MatrixConverter; typedef typename device::execution_space exec_space; @@ -209,9 +210,8 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, crsMat_t crsmat2("CrsMatrix2", out_c, pf_v, static_graph2); // this converts the previous generated matrix to block matrix. - auto input_mat = - MatrixConverter::from_blockcrs_formatted_point_crsmatrix( - crsmat2, block_size); + auto input_mat = MatrixConverter::from_blockcrs_formatted_point_crsmatrix( + crsmat2, block_size); lno_t nv = ((crsmat2.numRows() + block_size - 1) / block_size) * block_size; @@ -259,8 +259,8 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using namespace Test; srand(245); - using crsMat_t = - typename KokkosSparse::CrsMatrix; + using crsMat_t = typename KokkosSparse::CrsMatrix; using MatrixConverter = KokkosSparse::Impl::MatrixConverter; typedef typename device::execution_space exec_space; @@ -296,9 +296,8 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, graph_t static_graph2(pf_e, pf_rm); crsMat_t crsmat2("CrsMatrix2", out_c, pf_v, static_graph2); - auto input_mat = - MatrixConverter::from_blockcrs_formatted_point_crsmatrix( - crsmat2, block_size); + auto input_mat = MatrixConverter::from_blockcrs_formatted_point_crsmatrix( + crsmat2, block_size); lno_t nv = ((crsmat2.numRows() + block_size - 1) / block_size) * block_size; @@ -392,10 +391,10 @@ void test_block_gauss_seidel_empty() { entries_type entries("Entries", 0); scalar_view_t values("Values", 0); // also, make sure graph symmetrization doesn't crash on zero rows - KSExp::block_gauss_seidel_symbolic(&kh, num_rows, num_rows, block_size, rowmap, - entries, false); - KSExp::block_gauss_seidel_numeric(&kh, num_rows, num_rows, block_size, - rowmap, entries, values, false); + KSExp::block_gauss_seidel_symbolic(&kh, num_rows, num_rows, block_size, + rowmap, entries, false); + KSExp::block_gauss_seidel_numeric( + &kh, num_rows, num_rows, block_size, rowmap, entries, values, false); scalar_view_t x("X", num_rows); scalar_view_t y("Y", num_rows); scalar_t omega(0.9); From 8a4a634477ecbd1dac5616144abaadaecc0a2a9b Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 14 Jul 2022 11:53:45 -0600 Subject: [PATCH 224/261] ArithTraits: fix issue with sign change warning The logic to decide how the abs and nan functions are implemented based on the signedness of val_type should be more robust now and will prevent the compiler warning. It also removes the macro parameters that were used until now! --- src/common/Kokkos_ArithTraits.hpp | 78 +++++++++++++++---------------- 1 file changed, 38 insertions(+), 40 deletions(-) diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp index d6271f9b4e..7ffaa53e02 100644 --- a/src/common/Kokkos_ArithTraits.hpp +++ b/src/common/Kokkos_ArithTraits.hpp @@ -413,22 +413,27 @@ namespace Details { static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ static FUNC_QUAL mag_type eps() { return epsilon(); } -#define KOKKOSKERNELS_SIGNED_ABS \ - static KOKKOS_FUNCTION mag_type abs(const val_type x) { \ - return Kokkos::abs(x); \ - } - -#define KOKKOSKERNELS_UNSIGNED_ABS \ - static KOKKOS_FUNCTION mag_type abs(const val_type x) { return x; } - -#define KOKKOSKERNELS_SIGNED_NAN \ - static KOKKOS_FUNCTION val_type nan() { return -1; } - -#define KOKKOSKERNELS_UNSIGNED_NAN \ - static KOKKOS_FUNCTION val_type nan() { return max(); } - -#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_ABS, \ - KOKKOSKERNELS_NAN) \ +template +static KOKKOS_FUNCTION +typename std::enable_if::is_signed, val_type>::type +KokkosKernelsAbs(const val_type x) { return Kokkos::abs(x); } + +template +static KOKKOS_FUNCTION +typename std::enable_if::is_signed, val_type>::type +KokkosKernelsAbs(const val_type x) { return x; } + +template +static KOKKOS_FUNCTION +typename std::enable_if::is_signed, val_type>::type +KokkosKernelsNan() { return -1; } + +template +static KOKKOS_FUNCTION +typename std::enable_if::is_signed, val_type>::type +KokkosKernelsNan() { return max(); } + +#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() \ \ static constexpr bool is_specialized = true; \ static constexpr bool is_integer = true; \ @@ -456,10 +461,14 @@ namespace Details { static KOKKOS_FUNCTION val_type infinity() { \ return static_cast(0); \ } \ - KOKKOSKERNELS_NAN \ + static KOKKOS_FUNCTION val_type nan() { \ + return KokkosKernelsNan(); \ + } \ static KOKKOS_FUNCTION bool isInf(const val_type) { return false; } \ static KOKKOS_FUNCTION bool isNan(const val_type) { return false; } \ - KOKKOSKERNELS_ABS \ + static KOKKOS_FUNCTION mag_type abs(const val_type x) { \ + return KokkosKernelsAbs(x); \ + } \ static KOKKOS_FUNCTION mag_type real(const val_type x) { \ return Kokkos::real(x); \ } \ @@ -1659,8 +1668,7 @@ class ArithTraits { static std::string name() { return "char"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS, - KOKKOSKERNELS_SIGNED_NAN) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() }; template <> @@ -1673,8 +1681,7 @@ class ArithTraits { static std::string name() { return "signed char"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS, - KOKKOSKERNELS_SIGNED_NAN) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() }; template <> @@ -1687,8 +1694,7 @@ class ArithTraits { static std::string name() { return "unsigned char"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS, - KOKKOSKERNELS_UNSIGNED_NAN) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() }; template <> @@ -1701,8 +1707,7 @@ class ArithTraits { static std::string name() { return "short"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS, - KOKKOSKERNELS_SIGNED_NAN) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() }; template <> @@ -1715,8 +1720,7 @@ class ArithTraits { static std::string name() { return "unsigned short"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS, - KOKKOSKERNELS_UNSIGNED_NAN) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() }; template <> @@ -1729,8 +1733,7 @@ class ArithTraits { static std::string name() { return "int"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS, - KOKKOSKERNELS_SIGNED_NAN) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() }; template <> @@ -1743,8 +1746,7 @@ class ArithTraits { static std::string name() { return "unsigned int"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS, - KOKKOSKERNELS_UNSIGNED_NAN) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() }; template <> @@ -1757,8 +1759,7 @@ class ArithTraits { static std::string name() { return "long"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS, - KOKKOSKERNELS_SIGNED_NAN) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() }; template <> @@ -1771,8 +1772,7 @@ class ArithTraits { static std::string name() { return "unsigned long"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS, - KOKKOSKERNELS_UNSIGNED_NAN) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() }; template <> @@ -1785,8 +1785,7 @@ class ArithTraits { static std::string name() { return "long long"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_SIGNED_ABS, - KOKKOSKERNELS_SIGNED_NAN) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() }; template <> @@ -1799,8 +1798,7 @@ class ArithTraits { static std::string name() { return "unsigned long long"; } - KOKKOSKERNELS_ARITHTRAITS_INTEGRAL(KOKKOSKERNELS_UNSIGNED_ABS, - KOKKOSKERNELS_UNSIGNED_NAN) + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() }; // dd_real and qd_real are floating-point types provided by the QD From 81696dad16ab82ee454a01c68af0b6b51c9a0b3d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 14 Jul 2022 11:59:46 -0600 Subject: [PATCH 225/261] Fix -Werror Drop struct from creation of Kokkos::InitializationSettings type --- perf_test/sparse/KokkosSparse_block_pcg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/sparse/KokkosSparse_block_pcg.cpp b/perf_test/sparse/KokkosSparse_block_pcg.cpp index 8e453b4d01..f6b3fd3a87 100644 --- a/perf_test/sparse/KokkosSparse_block_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_block_pcg.cpp @@ -381,7 +381,7 @@ int main(int argc, char **argv) { int cmdline[CMD_COUNT]; char *mtx_bin_file = NULL; int block_size = 5; - struct Kokkos::InitializationSettings kargs; + Kokkos::InitializationSettings kargs; for (int i = 0; i < CMD_COUNT; ++i) cmdline[i] = 0; From e2710dcd89763b44e3377e03b2f4c5df31c74e65 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 14 Jul 2022 13:01:39 -0600 Subject: [PATCH 226/261] Fix build when double not instantiated Use default scalar (may be float or double) for the KernelHandle in graph coloring unit test. --- unit_test/graph/Test_Graph_graph_color.hpp | 9 +++++---- unit_test/graph/Test_Graph_graph_color_deterministic.hpp | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/unit_test/graph/Test_Graph_graph_color.hpp b/unit_test/graph/Test_Graph_graph_color.hpp index da86546862..c1203d9492 100644 --- a/unit_test/graph/Test_Graph_graph_color.hpp +++ b/unit_test/graph/Test_Graph_graph_color.hpp @@ -50,6 +50,7 @@ #include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_Utils.hpp" #include "KokkosKernels_Handle.hpp" +#include "KokkosKernels_default_types.hpp" using namespace KokkosKernels; using namespace KokkosKernels::Experimental; @@ -220,14 +221,14 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, int, TestExecSpace) #endif // FIXME_SYCL @@ -236,7 +237,7 @@ EXECUTE_TEST(double, int64_t, int, TestExecSpace) defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int, size_t, TestExecSpace) #endif #endif @@ -244,7 +245,7 @@ EXECUTE_TEST(double, int, size_t, TestExecSpace) defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, size_t, TestExecSpace) #endif #undef EXECUTE_TEST diff --git a/unit_test/graph/Test_Graph_graph_color_deterministic.hpp b/unit_test/graph/Test_Graph_graph_color_deterministic.hpp index 2fd64675ec..e2e4a3d227 100644 --- a/unit_test/graph/Test_Graph_graph_color_deterministic.hpp +++ b/unit_test/graph/Test_Graph_graph_color_deterministic.hpp @@ -50,6 +50,7 @@ #include "KokkosKernels_IOUtils.hpp" #include "KokkosSparse_Utils.hpp" #include "KokkosKernels_Handle.hpp" +#include "KokkosKernels_default_types.hpp" using namespace KokkosKernels; using namespace KokkosKernels::Experimental; @@ -274,28 +275,28 @@ void test_coloring_deterministic(lno_t numRows, size_type nnz) { defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int, size_t, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, size_t, TestExecSpace) #endif #undef EXECUTE_TEST From 479b337308cab4c0de6e92adc4771a147c538dd5 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 14 Jul 2022 13:27:08 -0600 Subject: [PATCH 227/261] Shrink trsv test matrices - Make it pass with scalar=float, without increasing tolerance - Reduce running time of the test. The matrix generator always makes a dense triangle, so it's N^2 memory and spmv/solve time. --- unit_test/sparse/Test_Sparse_trsv.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/unit_test/sparse/Test_Sparse_trsv.hpp b/unit_test/sparse/Test_Sparse_trsv.hpp index 938b040743..9a23f48883 100644 --- a/unit_test/sparse/Test_Sparse_trsv.hpp +++ b/unit_test/sparse/Test_Sparse_trsv.hpp @@ -104,16 +104,19 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, Test::check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "T"); } +// Note BMK 7-22: the matrix generator used by this test always +// generates a dense triangle. It ignores bandwidth, nnz and row size variance. + #define EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##trsv_mv##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ test_trsv_mv( \ - 5000, 5000 * 30, 200, 10, 1); \ + 1000, 1000 * 30, 200, 10, 1); \ test_trsv_mv( \ - 5000, 5000 * 30, 100, 10, 5); \ + 800, 800 * 30, 100, 10, 5); \ test_trsv_mv( \ - 1000, 1000 * 20, 100, 5, 10); \ + 400, 400 * 20, 100, 5, 10); \ } #if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ From b2cd5369c9a505cb6b4d7987313fc63e479136ca Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 14 Jul 2022 14:45:38 -0600 Subject: [PATCH 228/261] ArithTraits: fix undefined function error Let us just used the underlying Kokkos function for max(). --- src/common/Kokkos_ArithTraits.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp index 7ffaa53e02..4bd3748d3d 100644 --- a/src/common/Kokkos_ArithTraits.hpp +++ b/src/common/Kokkos_ArithTraits.hpp @@ -431,7 +431,7 @@ KokkosKernelsNan() { return -1; } template static KOKKOS_FUNCTION typename std::enable_if::is_signed, val_type>::type -KokkosKernelsNan() { return max(); } +KokkosKernelsNan() { return Kokkos::Experimental::finite_max::value; } #define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() \ \ From d82e7e097002168344a3a3f75949d12ccb89f096 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 14 Jul 2022 16:00:53 -0600 Subject: [PATCH 229/261] ArithTraits: applying clang-format --- src/common/Kokkos_ArithTraits.hpp | 36 ++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp index 4bd3748d3d..108e845694 100644 --- a/src/common/Kokkos_ArithTraits.hpp +++ b/src/common/Kokkos_ArithTraits.hpp @@ -413,25 +413,37 @@ namespace Details { static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ static FUNC_QUAL mag_type eps() { return epsilon(); } -template +template static KOKKOS_FUNCTION -typename std::enable_if::is_signed, val_type>::type -KokkosKernelsAbs(const val_type x) { return Kokkos::abs(x); } + typename std::enable_if::is_signed, + val_type>::type + KokkosKernelsAbs(const val_type x) { + return Kokkos::abs(x); +} -template +template static KOKKOS_FUNCTION -typename std::enable_if::is_signed, val_type>::type -KokkosKernelsAbs(const val_type x) { return x; } + typename std::enable_if::is_signed, + val_type>::type + KokkosKernelsAbs(const val_type x) { + return x; +} -template +template static KOKKOS_FUNCTION -typename std::enable_if::is_signed, val_type>::type -KokkosKernelsNan() { return -1; } + typename std::enable_if::is_signed, + val_type>::type + KokkosKernelsNan() { + return -1; +} -template +template static KOKKOS_FUNCTION -typename std::enable_if::is_signed, val_type>::type -KokkosKernelsNan() { return Kokkos::Experimental::finite_max::value; } + typename std::enable_if::is_signed, + val_type>::type + KokkosKernelsNan() { + return Kokkos::Experimental::finite_max::value; +} #define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() \ \ From b8b972ec4b08a721c1a51431d020b53572a6a92c Mon Sep 17 00:00:00 2001 From: Berger-Vergiat Date: Tue, 5 Jul 2022 08:16:01 -0600 Subject: [PATCH 230/261] blas dot/axpy: adding serial on device implementation These are going to be used for the Newton solver. They are only adding a serial implementation no team or team-vector variants. The implementation does support rank 1 and rank 2 views. Serial axpy/nrm2: adding examples --- src/blas/KokkosBlas1_axpby.hpp | 27 ++ src/blas/KokkosBlas1_nrm2.hpp | 58 +++++ src/blas/impl/KokkosBlas_serial_axpy.hpp | 88 +++++++ src/blas/impl/KokkosBlas_serial_nrm2.hpp | 92 +++++++ unit_test/blas/Test_Blas.hpp | 2 + unit_test/blas/Test_Blas_serial_axpy.hpp | 217 ++++++++++++++++ unit_test/blas/Test_Blas_serial_nrm2.hpp | 316 +++++++++++++++++++++++ 7 files changed, 800 insertions(+) create mode 100644 src/blas/impl/KokkosBlas_serial_axpy.hpp create mode 100644 src/blas/impl/KokkosBlas_serial_nrm2.hpp create mode 100644 unit_test/blas/Test_Blas_serial_axpy.hpp create mode 100644 unit_test/blas/Test_Blas_serial_nrm2.hpp diff --git a/src/blas/KokkosBlas1_axpby.hpp b/src/blas/KokkosBlas1_axpby.hpp index cae0cc7102..e8b79df565 100644 --- a/src/blas/KokkosBlas1_axpby.hpp +++ b/src/blas/KokkosBlas1_axpby.hpp @@ -46,6 +46,7 @@ #define KOKKOSBLAS1_AXPBY_HPP_ #include +#include #include #include @@ -124,6 +125,32 @@ void axpy(const AV& a, const XMV& X, const YMV& Y) { Y); } +/// +/// Serial axpy on device +/// +template +KOKKOS_FUNCTION void serial_axpy(const scalar_type alpha, const XMV X, YMV Y) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBlas::serial_axpy: XMV is not a Kokkos::View"); + static_assert(Kokkos::is_view::value, + "KokkosBlas::serial_axpy: YMV is not a Kokkos::View"); + static_assert(XMV::Rank == 1 || XMV::Rank == 2, + "KokkosBlas::serial_axpy: XMV must have rank 1 or 2."); + static_assert( + XMV::Rank == YMV::Rank, + "KokkosBlas::serial_axpy: XMV and YMV must have the same rank."); + + if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { + Kokkos::abort("KokkosBlas::serial_axpy: X and Y dimensions do not match"); + } +#endif // KOKKOSKERNELS_DEBUG_LEVEL + + return Impl::serial_axpy_mv(X.extent(0), X.extent(1), alpha, X.data(), + Y.data(), X.stride_0(), X.stride_1(), + Y.stride_0(), Y.stride_1()); +} + } // namespace KokkosBlas #endif diff --git a/src/blas/KokkosBlas1_nrm2.hpp b/src/blas/KokkosBlas1_nrm2.hpp index 3a10e48a4d..bbe231e795 100644 --- a/src/blas/KokkosBlas1_nrm2.hpp +++ b/src/blas/KokkosBlas1_nrm2.hpp @@ -46,6 +46,7 @@ #define KOKKOSBLAS1_NRM2_HPP_ #include +#include #include #include @@ -156,6 +157,63 @@ void nrm2(const RV& R, const XMV& X, Impl::Nrm2::nrm2(R_internal, X_internal, true); } + +/// +/// Serial nrm2 +/// +template +KOKKOS_INLINE_FUNCTION typename Kokkos::Details::InnerProductSpaceTraits< + typename XMV::non_const_value_type>::mag_type +serial_nrm2(const XMV X) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); + static_assert(XMV::Rank == 1, + "KokkosBlas::serial_nrm2: XMV must have rank 1"); +#endif // KOKKOSKERNELS_DEBUG_LEVEL + + return Impl::serial_nrm2(X.extent(0), X.data(), X.stride_0()); +} + +template +KOKKOS_INLINE_FUNCTION int serial_nrm2(const XMV X, const RV& R) { +// Do some compile time check when debug is enabled +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); + static_assert(Kokkos::is_view::value, + "KokkosBlas::serial_nrm2: RV is not a Kokkos::View"); + static_assert(std::is_same::value, + "KokkosBlas::serial_nrm2: R is const. " + "It must be nonconst, because it is an output argument " + "(we have to be able to write to its entries)."); + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || + ((RV::rank == 1) && (XMV::rank == 2)), + "KokkosBlas::serial_nrm2: " + "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); + + using norm_type = typename Kokkos::Details::InnerProductSpaceTraits< + typename XMV::non_const_value_type>::mag_type; + static_assert( + std::is_same::value, + "KokkosBlas::serial_nrm2: RV must have same value_type as" + " Kokkos::ArithTraits::mag_type"); + + if (R.extent(0) != X.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBlas::serial_nrm2 (MV): Dimensions of R and X do not match," + " R: %d and X: %d x %d.\n", + R.extent_int(0), X.extent_int(0), X.extent_int(1)); + return 1; + } +#endif // KOKKOSKERNELS_DEBUG_LEVEL + + Impl::serial_nrm2(X.extent(0), X.extent(1), X.data(), X.stride_0(), + X.stride_1(), R.data(), R.stride_0()); + return 0; +} + } // namespace KokkosBlas #endif // KOKKOSBLAS1_NRM2_HPP_ diff --git a/src/blas/impl/KokkosBlas_serial_axpy.hpp b/src/blas/impl/KokkosBlas_serial_axpy.hpp new file mode 100644 index 0000000000..f9cc918650 --- /dev/null +++ b/src/blas/impl/KokkosBlas_serial_axpy.hpp @@ -0,0 +1,88 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Luc Berger-Vergiat (lberge@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSBLAS_SERIAL_AXPY_IMPL_HPP_ +#define KOKKOSBLAS_SERIAL_AXPY_IMPL_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +/// +/// Serial Internal Impl +/// ==================== +template +KOKKOS_INLINE_FUNCTION static void serial_axpy( + const int m, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT X, + /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int ys0) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = 0; i < m; ++i) Y[i * ys0] += alpha * X[i * xs0]; + + return; +} + +template +KOKKOS_INLINE_FUNCTION static void serial_axpy_mv( + const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT X, + /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int xs1, + const int ys0, const int ys1) { + if (xs0 > xs1) { + for (int i = 0; i < m; ++i) + serial_axpy(n, alpha, X + i * xs0, Y + i * ys0, xs1, ys1); + } else { + for (int j = 0; j < n; ++j) + serial_axpy(m, alpha, X + j * xs1, Y + j * ys1, xs0, ys0); + } + + return; +} + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/src/blas/impl/KokkosBlas_serial_nrm2.hpp b/src/blas/impl/KokkosBlas_serial_nrm2.hpp new file mode 100644 index 0000000000..9397dc5020 --- /dev/null +++ b/src/blas/impl/KokkosBlas_serial_nrm2.hpp @@ -0,0 +1,92 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSBLAS_SERIAL_NRM2_HPP_ +#define KOKKOSBLAS_SERIAL_NRM2_HPP_ + +#include +#include + +namespace KokkosBlas { +namespace Impl { + +/// +/// Serial Internal Impl +/// ==================== +template +KOKKOS_INLINE_FUNCTION static + typename Kokkos::Details::InnerProductSpaceTraits::mag_type + serial_nrm2(const int m, const ValueType *KOKKOS_RESTRICT X, + const int xs0) { + using IPT = Kokkos::Details::InnerProductSpaceTraits; + using norm_type = typename IPT::mag_type; + + norm_type nrm = Kokkos::ArithTraits::zero(); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = 0; i < m; ++i) + nrm += IPT::norm(IPT::dot(X[i * xs0], X[i * xs0])); + + return Kokkos::ArithTraits::sqrt(nrm); +} + +template +KOKKOS_INLINE_FUNCTION static void serial_nrm2( + const int m, const int n, const ValueType *KOKKOS_RESTRICT X, const int xs0, + const int xs1, + typename Kokkos::Details::InnerProductSpaceTraits::mag_type + *KOKKOS_RESTRICT R, + const int ys0) { + for (int vecIdx = 0; vecIdx < n; ++vecIdx) + R[vecIdx * ys0] = serial_nrm2(m, X + vecIdx * xs1, xs0); + + return; +} + +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS_SERIAL_NRM2_HPP_ diff --git a/unit_test/blas/Test_Blas.hpp b/unit_test/blas/Test_Blas.hpp index 42b1050c40..77b5d14bc4 100644 --- a/unit_test/blas/Test_Blas.hpp +++ b/unit_test/blas/Test_Blas.hpp @@ -25,6 +25,8 @@ // Serial Blas 1 #include "Test_Blas1_serial_setscal.hpp" +#include "Test_Blas_serial_axpy.hpp" +#include "Test_Blas_serial_nrm2.hpp" // Team Blas 1 #include "Test_Blas1_team_setscal.hpp" diff --git a/unit_test/blas/Test_Blas_serial_axpy.hpp b/unit_test/blas/Test_Blas_serial_axpy.hpp new file mode 100644 index 0000000000..bd5dbcb5f6 --- /dev/null +++ b/unit_test/blas/Test_Blas_serial_axpy.hpp @@ -0,0 +1,217 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Luc Berger-Vergiat (lberge@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TEST_BLAS_SERIAL_AXPY_HPP_ +#define TEST_BLAS_SERIAL_AXPY_HPP_ + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" +#include "KokkosKernels_TestUtils.hpp" + +#include "KokkosBlas1_axpby.hpp" + +namespace Test { + +struct KokkosKernelAxpyTag {}; +struct NaiveAxpyTag {}; + +template +struct Functor_TestBlasSerialAxpy { + ScalarType _alpha; + ViewType _x; + ViewType _y; + + KOKKOS_INLINE_FUNCTION + Functor_TestBlasSerialAxpy(const ScalarType alpha, const ViewType &x, + const ViewType &y) + : _alpha(alpha), _x(x), _y(y) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const KokkosKernelAxpyTag &, const int i) const { + auto X = Kokkos::subview(_x, i, Kokkos::ALL(), Kokkos::ALL()); + auto Y = Kokkos::subview(_y, i, Kokkos::ALL(), Kokkos::ALL()); + KokkosBlas::serial_axpy(_alpha, X, Y); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const NaiveAxpyTag &, const int k) const { + auto X = Kokkos::subview(_x, k, Kokkos::ALL(), Kokkos::ALL()); + auto Y = Kokkos::subview(_y, k, Kokkos::ALL(), Kokkos::ALL()); + const int m = X.extent(0), n = X.extent(1); + for (int i = 0; i < m; ++i) + for (int j = 0; j < n; ++j) Y(i, j) += _alpha * X(i, j); + } + + inline void run() { + using value_type = typename ViewType::value_type; + std::string name_region("KokkosBlas::Test::SerialAxpy"); + const std::string name_value_type = Test::value_type_name(); + std::string name_work_tag = + (std::is_same::value + ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = "Axpy"; + std::string name = + name_region + name_value_type + name_work_tag + name_test_id; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _x.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + return; + } +}; + +template +void impl_test_blas_serial_axpy(const int N, const int BlkSize) { + /// typedefs + using value_type = typename ViewType::value_type; + using ats = Kokkos::ArithTraits; + + /// radomized input testing views + const ScalarType alpha = 11.1; + ViewType X("X", N, BlkSize, BlkSize); + ViewType Y("Y", N, BlkSize, BlkSize); + ViewType Yref("Yref", N, BlkSize, BlkSize); + + Kokkos::Random_XorShift64_Pool random( + 13718); + Kokkos::fill_random(X, random, ats::one()); + Kokkos::fill_random(Y, random, ats::one()); + Kokkos::fence(); + Kokkos::deep_copy(Yref, Y); + + /// test body + Functor_TestBlasSerialAxpy( + alpha, X, Yref) + .run(); + Functor_TestBlasSerialAxpy( + alpha, X, Y) + .run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename ViewType::HostMirror Y_host = Kokkos::create_mirror_view(Y); + typename ViewType::HostMirror Yref_host = Kokkos::create_mirror_view(Yref); + + Kokkos::deep_copy(Y_host, Y); + Kokkos::deep_copy(Yref_host, Yref); + + /// check a = b + typename ats::mag_type eps = + 100 * std::numeric_limits::epsilon(); + for (int k = 0; k < N; ++k) + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) + EXPECT_NEAR_KK(Y_host(k, i, j), Yref_host(k, i, j), eps); +} + +} // namespace Test + +template +int test_blas_serial_axpy() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View + ViewType; + Test::impl_test_blas_serial_axpy(0, 10); + Test::impl_test_blas_serial_axpy(10, 15); + Test::impl_test_blas_serial_axpy(1024, 9); + Test::impl_test_blas_serial_axpy(132231, + 3); + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View + ViewType; + Test::impl_test_blas_serial_axpy(0, 10); + Test::impl_test_blas_serial_axpy(10, 15); + Test::impl_test_blas_serial_axpy(1024, 9); + Test::impl_test_blas_serial_axpy(132231, + 3); + } +#endif + + return 0; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, serial_axpy_float_float) { + test_blas_serial_axpy(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, serial_axpy_double_double) { + test_blas_serial_axpy(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, serial_axpy_dcomplex_dcomplex) { + test_blas_serial_axpy, + Kokkos::complex >(); +} + +TEST_F(TestCategory, serial_axpy_dcomplex_double) { + test_blas_serial_axpy, double>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, serial_axpy_fcomplex_fcomplex) { + test_blas_serial_axpy, + Kokkos::complex >(); +} + +TEST_F(TestCategory, serial_axpy_fcomplex_float) { + test_blas_serial_axpy, float>(); +} +#endif + +#endif // TEST_BLAS_SERIAL_AXPY_HPP_ diff --git a/unit_test/blas/Test_Blas_serial_nrm2.hpp b/unit_test/blas/Test_Blas_serial_nrm2.hpp new file mode 100644 index 0000000000..1a2721e782 --- /dev/null +++ b/unit_test/blas/Test_Blas_serial_nrm2.hpp @@ -0,0 +1,316 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Luc Berger-Vergiat (lberge@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TEST_BLAS_SERIAL_NRM2_HPP_ +#define TEST_BLAS_SERIAL_NRM2_HPP_ + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" +#include "KokkosKernels_TestUtils.hpp" + +#include "KokkosBlas1_nrm2.hpp" + +namespace Test { + +template +struct Functor_TestBlasSerialNrm2 { + using execution_space = typename DeviceType::execution_space; + using value_type = typename ViewType::non_const_value_type; + using IPT = Kokkos::Details::InnerProductSpaceTraits; + using norm_type = typename IPT::mag_type; + using norm_view_type = Kokkos::View; + + ViewType _x; + norm_view_type _nrm; + + KOKKOS_INLINE_FUNCTION + Functor_TestBlasSerialNrm2(const ViewType &x, const norm_view_type &nrm) + : _x(x), _nrm(nrm) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const KokkosKernelTag &, const int i) const { + auto X = Kokkos::subview(_x, i, Kokkos::ALL()); + _nrm(i) = KokkosBlas::serial_nrm2(X); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const NaiveTag &, const int k) const { + auto X = Kokkos::subview(_x, k, Kokkos::ALL()); + _nrm(k) = Kokkos::ArithTraits::zero(); + for (int i = 0; i < X.extent_int(0); ++i) { + _nrm(k) += IPT::norm(IPT::dot(X(i), X(i))); + } + + _nrm(k) = Kokkos::ArithTraits::sqrt(_nrm(k)); + } + + inline void run() { + std::string name_region("KokkosBlas::Test::SerialNrm2"); + const std::string name_value_type = Test::value_type_name(); + std::string name_work_tag = + (std::is_same::value + ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = "Nrm2"; + std::string name = + name_region + name_value_type + name_work_tag + name_test_id; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _x.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + return; + } +}; + +template +struct Functor_TestBlasSerialNrm2MV { + using execution_space = typename DeviceType::execution_space; + using value_type = typename ViewType::non_const_value_type; + using IPT = Kokkos::Details::InnerProductSpaceTraits; + using norm_type = typename IPT::mag_type; + using norm_view_type = Kokkos::View; + + ViewType _x; + norm_view_type _nrm; + + KOKKOS_INLINE_FUNCTION + Functor_TestBlasSerialNrm2MV(const ViewType &x, const norm_view_type &nrm) + : _x(x), _nrm(nrm) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const KokkosKernelTag &, const int i) const { + auto X = Kokkos::subview(_x, i, Kokkos::ALL(), Kokkos::ALL()); + auto R = Kokkos::subview(_nrm, i, Kokkos::ALL()); + KokkosBlas::serial_nrm2(X, R); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const NaiveTag &, const int k) const { + auto X = Kokkos::subview(_x, k, Kokkos::ALL(), Kokkos::ALL()); + auto R = Kokkos::subview(_nrm, k, Kokkos::ALL()); + + for (int colIdx = 0; colIdx < X.extent_int(1); ++colIdx) { + R(colIdx) = Kokkos::ArithTraits::zero(); + for (int rowIdx = 0; rowIdx < X.extent_int(0); ++rowIdx) { + R(colIdx) += IPT::norm(IPT::dot(X(rowIdx, colIdx), X(rowIdx, colIdx))); + } + R(colIdx) = Kokkos::ArithTraits::sqrt(R(colIdx)); + } + } + + inline void run() { + std::string name_region("KokkosBlas::Test::SerialNrm2MV"); + const std::string name_value_type = Test::value_type_name(); + std::string name_work_tag = + (std::is_same::value + ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = "Nrm2"; + std::string name = + name_region + name_value_type + name_work_tag + name_test_id; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _x.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + return; + } +}; + +template +void impl_test_blas_serial_nrm2(const int N, const int BlkSize) { + /// typedefs + using execution_space = typename DeviceType::execution_space; + using value_type = typename ViewType::non_const_value_type; + using ats = Kokkos::ArithTraits; + using IPT = Kokkos::Details::InnerProductSpaceTraits; + using norm_type = typename IPT::mag_type; + using norm_view_type = Kokkos::View; + + /// radomized input testing views + ViewType X("X", N, BlkSize); + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X, random, ats::one()); + Kokkos::fence(); + + norm_view_type norms("norms", N); + norm_view_type norms_ref("ref norms", N); + + /// test body + Functor_TestBlasSerialNrm2(X, norms).run(); + Functor_TestBlasSerialNrm2(X, + norms_ref) + .run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename norm_view_type::HostMirror norms_host = + Kokkos::create_mirror_view(norms); + typename norm_view_type::HostMirror norms_ref_host = + Kokkos::create_mirror_view(norms_ref); + + Kokkos::deep_copy(norms_host, norms); + Kokkos::deep_copy(norms_ref_host, norms_ref); + + /// check a = b + typename ats::mag_type eps = + 100 * std::numeric_limits::epsilon(); + for (int k = 0; k < N; ++k) + EXPECT_NEAR_KK(norms_host(k), norms_ref_host(k), eps); +} + +template +void impl_test_blas_serial_nrm2mv(const int N, const int vecLength, + const int numVecs) { + /// typedefs + using execution_space = typename DeviceType::execution_space; + using value_type = typename ViewType::non_const_value_type; + using ats = Kokkos::ArithTraits; + using IPT = Kokkos::Details::InnerProductSpaceTraits; + using norm_type = typename IPT::mag_type; + using norm_view_type = Kokkos::View; + + /// radomized input testing views + ViewType X("X", N, vecLength, numVecs); + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X, random, ats::one()); + Kokkos::fence(); + + norm_view_type norms("norms", N, numVecs); + norm_view_type norms_ref("ref norms", N, numVecs); + + /// test body + Functor_TestBlasSerialNrm2MV(X, norms).run(); + Functor_TestBlasSerialNrm2MV(X, + norms_ref) + .run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename norm_view_type::HostMirror norms_host = + Kokkos::create_mirror_view(norms); + typename norm_view_type::HostMirror norms_ref_host = + Kokkos::create_mirror_view(norms_ref); + + Kokkos::deep_copy(norms_host, norms); + Kokkos::deep_copy(norms_ref_host, norms_ref); + + /// check a = b + typename ats::mag_type eps = + 100 * std::numeric_limits::epsilon(); + for (int k = 0; k < N; ++k) + for (int vecIdx = 0; vecIdx < numVecs; ++vecIdx) + EXPECT_NEAR_KK(norms_host(k, vecIdx), norms_ref_host(k, vecIdx), eps); +} + +} // namespace Test + +template +int test_blas_serial_nrm2() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using ViewType = Kokkos::View; + Test::impl_test_blas_serial_nrm2(0, 10); + Test::impl_test_blas_serial_nrm2(10, 15); + Test::impl_test_blas_serial_nrm2(1024, 9); + Test::impl_test_blas_serial_nrm2(132231, 3); + + using MVViewType = + Kokkos::View; + Test::impl_test_blas_serial_nrm2mv(0, 10, 5); + Test::impl_test_blas_serial_nrm2mv(10, 15, 7); + Test::impl_test_blas_serial_nrm2mv(1024, 9, 5); + Test::impl_test_blas_serial_nrm2mv(132231, 3, 3); + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using ViewType = + Kokkos::View; + Test::impl_test_blas_serial_nrm2(0, 10); + Test::impl_test_blas_serial_nrm2(10, 15); + Test::impl_test_blas_serial_nrm2(1024, 9); + Test::impl_test_blas_serial_nrm2(132231, 3); + + using MVViewType = + Kokkos::View; + Test::impl_test_blas_serial_nrm2mv(0, 10, 5); + Test::impl_test_blas_serial_nrm2mv(10, 15, 5); + Test::impl_test_blas_serial_nrm2mv(1024, 9, 5); + Test::impl_test_blas_serial_nrm2mv(132231, 3, 3); + } +#endif + + return 0; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, serial_nrm2_float_float) { + test_blas_serial_nrm2(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, serial_nrm2_double_double) { + test_blas_serial_nrm2(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, serial_nrm2_fcomplex_float) { + test_blas_serial_nrm2 >(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, serial_nrm2_dcomplex_dcomplex) { + test_blas_serial_nrm2 >(); +} +#endif + +#endif // TEST_BLAS_SERIAL_NRM2_HPP_ From 29b51b3824b8350891c54fa4546d7dda363445f9 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 18 Jul 2022 09:03:27 -0600 Subject: [PATCH 231/261] Device BLAS: applying clang-format --- unit_test/blas/Test_Blas_serial_axpy.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/unit_test/blas/Test_Blas_serial_axpy.hpp b/unit_test/blas/Test_Blas_serial_axpy.hpp index bd5dbcb5f6..83892640a7 100644 --- a/unit_test/blas/Test_Blas_serial_axpy.hpp +++ b/unit_test/blas/Test_Blas_serial_axpy.hpp @@ -92,8 +92,9 @@ struct Functor_TestBlasSerialAxpy { std::string name_work_tag = (std::is_same::value ? "::KokkosBlas" - : std::is_same::value ? "::Naive" - : "::UnknownWorkTag"); + : std::is_same::value + ? "::Naive" + : "::UnknownWorkTag"); std::string name_test_id = "Axpy"; std::string name = name_region + name_value_type + name_work_tag + name_test_id; @@ -128,8 +129,8 @@ void impl_test_blas_serial_axpy(const int N, const int BlkSize) { Functor_TestBlasSerialAxpy( alpha, X, Yref) .run(); - Functor_TestBlasSerialAxpy( - alpha, X, Y) + Functor_TestBlasSerialAxpy(alpha, X, Y) .run(); Kokkos::fence(); From d8b85cc06d0b05622973c16c327ef1b4fe3d84ed Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 18 Jul 2022 16:55:44 -0600 Subject: [PATCH 232/261] Fixing too large team size on complex_double bspgemm test --- .../impl/KokkosSparse_bspgemm_impl_kkmem.hpp | 116 ++++++++++-------- 1 file changed, 67 insertions(+), 49 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp index a30bbfd170..aae9d83b5f 100644 --- a/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp @@ -99,11 +99,10 @@ struct KokkosBSPGEMMteam_size = team_size_; + this->thread_memory = (shared_memory_size / 8 / team_size_) * 8; + } + KOKKOS_INLINE_FUNCTION size_t get_thread_id(const size_t row_index) const { switch (my_exec_space) { @@ -282,9 +282,10 @@ struct KokkosBSPGEMMhandle->get_team_work_size( - suggested_team_size, this->concurrency, this->a_row_cnt); if (Base::KOKKOSKERNELS_VERBOSE) { std::cout << "\t\tPortableNumericCHASH -- adjusted hashsize:" << thread_shmem_hash_size @@ -1495,7 +1498,7 @@ void KokkosBSPGEMMKOKKOSKERNELS_VERBOSE) { - std::cout << "\t\t max_nnz: " << max_nnz << " chunk_size:" << chunksize + std::cout << "\t\t max_nnz: " << max_nnz << " min_hash_size:" << min_hash_size << " concurrency:" << this->concurrency << " MyExecSpace::concurrency():" << MyExecSpace::concurrency() @@ -1532,12 +1535,11 @@ void KokkosBSPGEMMKOKKOSKERNELS_VERBOSE); + lcl_my_exec_space, first_level_cut_off, flops_per_row, + this->KOKKOSKERNELS_VERBOSE); if (this->KOKKOSKERNELS_VERBOSE) { std::cout << "\t\tvector_size:" << suggested_vector_size - << " chunk_size:" << team_row_chunk_size << " suggested_team_size:" << suggested_team_size << std::endl; } timer1.reset(); @@ -1555,10 +1557,14 @@ void KokkosBSPGEMMa_row_cnt / team_row_chunk_size + 1, - suggested_team_size, suggested_vector_size), + gpu_team_policy4_t((this->a_row_cnt + team_size - 1) / team_size, + team_size, suggested_vector_size), sc); MyExecSpace().fence(); @@ -1574,10 +1580,14 @@ void KokkosBSPGEMMa_row_cnt / team_row_chunk_size + 1, - suggested_team_size, suggested_vector_size), + gpu_team_policy6_t((this->a_row_cnt + team_size - 1) / team_size, + team_size, suggested_vector_size), sc); } else { if (team_shmem_key_size <= 0) { @@ -1591,10 +1601,14 @@ void KokkosBSPGEMMa_row_cnt / team_row_chunk_size + 1, - suggested_team_size, suggested_vector_size), + gpu_team_policy_t((this->a_row_cnt + team_size - 1) / team_size, + team_size, suggested_vector_size), sc); } MyExecSpace().fence(); @@ -1603,13 +1617,15 @@ void KokkosBSPGEMMa_row_cnt / team_row_chunk_size + 1, + (this->a_row_cnt + suggested_team_size - 1) / + suggested_team_size, suggested_team_size, suggested_vector_size), sc); } else { Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_LP::STATIC", multicore_team_policy4_t( - this->a_row_cnt / team_row_chunk_size + 1, + (this->a_row_cnt + suggested_team_size - 1) / + suggested_team_size, suggested_team_size, suggested_vector_size), sc); } @@ -1617,15 +1633,17 @@ void KokkosBSPGEMMa_row_cnt / team_row_chunk_size + 1, + (this->a_row_cnt + suggested_team_size - 1) / + suggested_team_size, suggested_team_size, suggested_vector_size), sc); } else { - Kokkos::parallel_for( - "KOKKOSPARSE::SPGEMM::KKMEM::STATIC", - multicore_team_policy_t(this->a_row_cnt / team_row_chunk_size + 1, - suggested_team_size, suggested_vector_size), - sc); + Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::KKMEM::STATIC", + multicore_team_policy_t( + (this->a_row_cnt + suggested_team_size - 1) / + suggested_team_size, + suggested_team_size, suggested_vector_size), + sc); } } MyExecSpace().fence(); From 8a368d0c89088c4cb02b594f848f63c65e1f684c Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 18 Jul 2022 17:01:27 -0600 Subject: [PATCH 233/261] Fix D1 color ETI with both CudaSpace and UVM Since GaussSeidel can be instantiated with different temporary/persistent (aka fast/slow) memory spaces, and calls D1 coloring, D1 coloring itself also needs to be instantiated with fast/slow spaces. This fixes undefined reference errors on builds where UVM and CudaSpace are both instantiated. --- src/CMakeLists.txt | 2 +- src/graph/impl/KokkosGraph_color_d1_spec.hpp | 82 ++++++++++---------- 2 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8fd0bc21b8..a1c938aed5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -443,7 +443,7 @@ KOKKOSKERNELS_GENERATE_ETI(Graph_color_d1 color_d1 COMPONENTS graph HEADER_LIST ETI_HEADERS SOURCE_LIST SOURCES - TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h) diff --git a/src/graph/impl/KokkosGraph_color_d1_spec.hpp b/src/graph/impl/KokkosGraph_color_d1_spec.hpp index 67cd09a099..09366f2c4e 100644 --- a/src/graph/impl/KokkosGraph_color_d1_spec.hpp +++ b/src/graph/impl/KokkosGraph_color_d1_spec.hpp @@ -64,21 +64,21 @@ struct color_d1_eti_spec_avail { } // namespace Impl } // namespace KokkosGraph -#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template <> \ - struct color_d1_eti_spec_avail< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ - const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>> { \ - enum : bool { value = true }; \ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ + template <> \ + struct color_d1_eti_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = true }; \ }; // Include the actual specialization declarations @@ -118,34 +118,34 @@ struct COLOR_D1, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ + extern template struct COLOR_D1< \ + typename KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; -#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_INST(SCALAR_TYPE, ORDINAL_TYPE, \ - OFFSET_TYPE, LAYOUT_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ - template struct COLOR_D1< \ - KokkosKernels::Experimental::KokkosKernelsHandle< \ - const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ - EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ - Kokkos::View, \ - Kokkos::MemoryTraits>, \ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ + template struct COLOR_D1< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ false, true>; #include From 65aceec90c864723b73aad1c277b9c31a4922bdd Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 19 Jul 2022 22:55:59 -0400 Subject: [PATCH 234/261] Fixup Batched GEMM cannot you Kokkos layout as WorkTag --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 7bc5529fcc..d6331e215d 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -59,6 +59,21 @@ namespace Impl { /// CT/NT, NT/CT, CT/CT /// +struct LayoutLeftTag {}; +struct LayoutRightTag {}; +template +struct TagFromLayoutHelper; +template <> +struct TagFromLayoutHelper { + using tag = LayoutLeftTag; +}; +template <> +struct TagFromLayoutHelper { + using tag = LayoutRightTag; +}; +template +using TagFromLayout = typename TagFromLayoutHelper::tag; + // TODO - scaling between (32x32, 64x64) // Option 0: Increase number of tiles and figure out how to map kokkos teams // into cuda grid. Keep team size and vector lanes constant. @@ -117,7 +132,8 @@ class BatchedDblBufGemm { private: void __run() { - using policy_type = Kokkos::TeamPolicy; + using policy_type = + Kokkos::TeamPolicy, execution_space_type>; using member_type = typename policy_type::member_type; // Compile-time expressions required for functor-level register allocations: @@ -335,8 +351,7 @@ class BatchedDblBufGemm { } KOKKOS_INLINE_FUNCTION - void operator()(const Kokkos::LayoutRight &, - const MemberType &member) const { + void operator()(LayoutRightTag, const MemberType &member) const { // TODO: use Kokkos view with compile-time size to allocating register?? // Then we can use local deep copy for prefetch_reg population. // Allocate registers used for prefetching @@ -503,8 +518,7 @@ class BatchedDblBufGemm { } KOKKOS_INLINE_FUNCTION - void operator()(const Kokkos::LayoutLeft &, - const MemberType &member) const { + void operator()(LayoutLeftTag, const MemberType &member) const { // TODO: use Kokkos view with compile-time size to allocating register?? // Then we can use local deep copy for prefetch_reg population. // Allocate registers used for prefetching From fdf340262c1d9b35c9515af1b39471515a30e0ff Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Tue, 19 Jul 2022 23:12:28 -0400 Subject: [PATCH 235/261] Fixup drop layout template param in rank-0 views --- src/blas/KokkosBlas_trtri.hpp | 2 +- src/blas/impl/KokkosBlas_trtri_spec.hpp | 6 +++--- src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp | 2 +- src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/blas/KokkosBlas_trtri.hpp b/src/blas/KokkosBlas_trtri.hpp index 0402b11104..5d170d3115 100644 --- a/src/blas/KokkosBlas_trtri.hpp +++ b/src/blas/KokkosBlas_trtri.hpp @@ -129,7 +129,7 @@ int trtri(const char uplo[], const char diag[], const AViewType& A) { // This is the return value type and should always reside on host using RViewInternalType = - Kokkos::View >; int result; diff --git a/src/blas/impl/KokkosBlas_trtri_spec.hpp b/src/blas/impl/KokkosBlas_trtri_spec.hpp index 1cccad1ea4..46014a6745 100644 --- a/src/blas/impl/KokkosBlas_trtri_spec.hpp +++ b/src/blas/impl/KokkosBlas_trtri_spec.hpp @@ -69,7 +69,7 @@ struct trtri_eti_spec_avail { MEM_SPACE) \ template <> \ struct trtri_eti_spec_avail< \ - Kokkos::View >, \ Kokkos::View, \ Kokkos::MemoryTraits > > { \ @@ -136,7 +136,7 @@ struct TRTRI { // #define KOKKOSBLAS_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ extern template struct TRTRI< \ - Kokkos::View >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -144,7 +144,7 @@ struct TRTRI { #define KOKKOSBLAS_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ template struct TRTRI< \ - Kokkos::View >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp index 974fe76eb0..cde6398073 100644 --- a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp @@ -58,7 +58,7 @@ struct trtri_tpl_spec_avail { #define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ template \ struct trtri_tpl_spec_avail< \ - Kokkos::View >, \ Kokkos::View, \ Kokkos::MemoryTraits > > { \ diff --git a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp index af9f843938..f1cabea576 100644 --- a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp @@ -55,14 +55,14 @@ namespace Impl { #define KOKKOSBLAS_TRTRI_BLAS_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRTRI >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View > \ RViewType; \ typedef Kokkos::View \ - struct TRTRI >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View > \ RViewType; \ typedef Kokkos::View Date: Wed, 20 Jul 2022 10:04:05 -0600 Subject: [PATCH 236/261] Make layout explicit again for 0D --- src/blas/KokkosBlas_trtri.hpp | 2 +- src/blas/impl/KokkosBlas_trtri_spec.hpp | 6 +++--- src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp | 2 +- src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/blas/KokkosBlas_trtri.hpp b/src/blas/KokkosBlas_trtri.hpp index 5d170d3115..afcc05d5ae 100644 --- a/src/blas/KokkosBlas_trtri.hpp +++ b/src/blas/KokkosBlas_trtri.hpp @@ -129,7 +129,7 @@ int trtri(const char uplo[], const char diag[], const AViewType& A) { // This is the return value type and should always reside on host using RViewInternalType = - Kokkos::View >; int result; diff --git a/src/blas/impl/KokkosBlas_trtri_spec.hpp b/src/blas/impl/KokkosBlas_trtri_spec.hpp index 46014a6745..0bbeb294dc 100644 --- a/src/blas/impl/KokkosBlas_trtri_spec.hpp +++ b/src/blas/impl/KokkosBlas_trtri_spec.hpp @@ -69,7 +69,7 @@ struct trtri_eti_spec_avail { MEM_SPACE) \ template <> \ struct trtri_eti_spec_avail< \ - Kokkos::View >, \ Kokkos::View, \ Kokkos::MemoryTraits > > { \ @@ -136,7 +136,7 @@ struct TRTRI { // #define KOKKOSBLAS_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ extern template struct TRTRI< \ - Kokkos::View >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -144,7 +144,7 @@ struct TRTRI { #define KOKKOSBLAS_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ template struct TRTRI< \ - Kokkos::View >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp index cde6398073..c025a1a11e 100644 --- a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp @@ -58,7 +58,7 @@ struct trtri_tpl_spec_avail { #define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ template \ struct trtri_tpl_spec_avail< \ - Kokkos::View >, \ Kokkos::View, \ Kokkos::MemoryTraits > > { \ diff --git a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp index f1cabea576..af6c186039 100644 --- a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp @@ -55,14 +55,14 @@ namespace Impl { #define KOKKOSBLAS_TRTRI_BLAS_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRTRI >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View > \ RViewType; \ typedef Kokkos::View \ - struct TRTRI >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View > \ RViewType; \ typedef Kokkos::View Date: Wed, 20 Jul 2022 10:15:22 -0700 Subject: [PATCH 237/261] Update spiluk numeric --- src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 9f9b5ef73c..e30a12d22e 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -689,6 +689,8 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, using size_type = typename IlukHandle::size_type; using nnz_lno_t = typename IlukHandle::nnz_lno_t; using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; + //using WorkViewType = + // Kokkos::View>; using WorkViewType = Kokkos::View>; using LevelHostViewType = Kokkos::View; From 941387e6d5a911f6076fc82b58020120649451fc Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 20 Jul 2022 15:37:43 -0700 Subject: [PATCH 238/261] Some clean ups --- src/sparse/KokkosSparse_spiluk_handle.hpp | 36 +++++++++++---- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 44 ++++++++----------- .../KokkosSparse_spiluk_symbolic_impl.hpp | 40 +++++++---------- 3 files changed, 62 insertions(+), 58 deletions(-) diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp index 1bf520c02b..227902a1af 100644 --- a/src/sparse/KokkosSparse_spiluk_handle.hpp +++ b/src/sparse/KokkosSparse_spiluk_handle.hpp @@ -92,6 +92,9 @@ class SPILUKHandle { typedef typename Kokkos::View nnz_row_view_host_t; + typedef typename Kokkos::View + nnz_lno_view_host_t; + typedef typename std::make_signed< typename nnz_row_view_t::non_const_value_type>::type signed_integral_t; typedef Kokkos::View signed_nnz_lno_view_t; + typedef Kokkos::View work_view_t; + private: nnz_row_view_t level_list; // level IDs which the rows belong to nnz_lno_view_t level_idx; // the list of rows in each level nnz_lno_view_t level_ptr; // the starting index (into the view level_idx) of each level - nnz_lno_view_t level_nchunks; // number of chunks of rows at each level - nnz_lno_view_t + nnz_lno_view_host_t level_nchunks; // number of chunks of rows at each level + nnz_lno_view_host_t level_nrowsperchunk; // maximum number of rows among chunks at each level nnz_row_view_host_t level_maxnnzperrow; // maximum number of nnz per row at each level @@ -114,6 +119,7 @@ class SPILUKHandle { // hash map at each level nnz_row_view_host_t level_shmem_key_size; // key size in the shared memory // hash map at each level + work_view_t iw;//working view for mapping dense indices to sparse indices size_type nrows; size_type nlevels; @@ -142,6 +148,7 @@ class SPILUKHandle { level_maxnnzperrow(), level_shmem_hash_size(), level_shmem_key_size(), + iw(), nrows(nrows_), nlevels(0), nnzL(nnzL_), @@ -164,10 +171,12 @@ class SPILUKHandle { level_list = nnz_row_view_t("level_list", nrows_), level_idx = nnz_lno_view_t("level_idx", nrows_), level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), - level_nchunks = nnz_lno_view_t(), level_nrowsperchunk = nnz_lno_view_t(), + level_nchunks = nnz_lno_view_host_t(), + level_nrowsperchunk = nnz_lno_view_host_t(), level_maxnnzperrow = nnz_row_view_host_t(), level_shmem_hash_size = nnz_row_view_host_t(), - level_shmem_key_size = nnz_row_view_host_t(), reset_symbolic_complete(); + level_shmem_key_size = nnz_row_view_host_t(), reset_symbolic_complete(), + iw = work_view_t(); } virtual ~SPILUKHandle(){}; @@ -186,17 +195,17 @@ class SPILUKHandle { nnz_lno_view_t get_level_ptr() const { return level_ptr; } KOKKOS_INLINE_FUNCTION - nnz_lno_view_t get_level_nchunks() const { return level_nchunks; } + nnz_lno_view_host_t get_level_nchunks() const { return level_nchunks; } void alloc_level_nchunks(const size_type nlevels_) { - level_nchunks = nnz_lno_view_t("level_nchunks", nlevels_); + level_nchunks = nnz_lno_view_host_t("level_nchunks", nlevels_); } KOKKOS_INLINE_FUNCTION - nnz_lno_view_t get_level_nrowsperchunk() const { return level_nrowsperchunk; } + nnz_lno_view_host_t get_level_nrowsperchunk() const { return level_nrowsperchunk; } void alloc_level_nrowsperchunk(const size_type nlevels_) { - level_nrowsperchunk = nnz_lno_view_t("level_nrowsperchunk", nlevels_); + level_nrowsperchunk = nnz_lno_view_host_t("level_nrowsperchunk", nlevels_); } KOKKOS_INLINE_FUNCTION @@ -228,6 +237,17 @@ class SPILUKHandle { nnz_row_view_host_t("level_shmem_key_size", nlevels_); } + KOKKOS_INLINE_FUNCTION + work_view_t get_iw() const { + return iw; + } + + void alloc_iw(const size_type nrows_, const size_type ncols_) { + iw = work_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), + nrows_, ncols_); + Kokkos::deep_copy(iw, nnz_lno_t(-1)); + } + KOKKOS_INLINE_FUNCTION size_type get_nrows() const { return nrows; } diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index e30a12d22e..baa8c318de 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -52,6 +52,8 @@ #include #include +#include + //#define NUMERIC_OUTPUT_INFO //#define NUMERIC_USE_FOR @@ -689,21 +691,18 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, using size_type = typename IlukHandle::size_type; using nnz_lno_t = typename IlukHandle::nnz_lno_t; using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; - //using WorkViewType = - // Kokkos::View>; - using WorkViewType = - Kokkos::View>; - using LevelHostViewType = Kokkos::View; - + using WorkViewType = typename IlukHandle::work_view_t; + using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; + + struct timeval begin, end;//VINH TEST + gettimeofday( &begin, NULL ); + size_type nlevels = thandle.get_num_levels(); size_type nrows = thandle.get_nrows(); // Keep these as host View, create device version and copy back to host HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); HandleDeviceEntriesType level_idx = thandle.get_level_idx(); - HandleDeviceEntriesType level_nchunks = thandle.get_level_nchunks(); - HandleDeviceEntriesType level_nrowsperchunk = - thandle.get_level_nrowsperchunk(); // Make level_ptr_h a separate allocation, since it will be accessed on host // between kernel launches. If a mirror were used and level_ptr is in UVM @@ -717,6 +716,9 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, level_ptr.extent(0)); Kokkos::deep_copy(level_ptr_h, level_ptr); + gettimeofday( &end, NULL ); + printf(" VINH TEST: numeric -- copy level_ptr %.8lf (sec.)\n", 1.0 * ( end.tv_sec - begin.tv_sec ) + 1.0e-6 * ( end.tv_usec - begin.tv_usec )); + if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) { auto level_shmem_hash_size = thandle.get_level_shmem_hash_size(); @@ -763,29 +765,17 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, } // end for lvl } // End SEQLVLSCHD_TP1HASHMAP else { + gettimeofday( &begin, NULL ); if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - level_nchunks_h = LevelHostViewType( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level nchunks"), - level_nchunks.extent(0)); - level_nrowsperchunk_h = - LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, - "Host level nrowsperchunk"), - level_nrowsperchunk.extent(0)); - Kokkos::deep_copy(level_nchunks_h, level_nchunks); - Kokkos::deep_copy(level_nrowsperchunk_h, level_nrowsperchunk); - iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), - thandle.get_level_maxrowsperchunk(), nrows); - Kokkos::deep_copy(iw, nnz_lno_t(-1)); - } else { - iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), - thandle.get_level_maxrows(), nrows); - Kokkos::deep_copy(iw, nnz_lno_t(-1)); + level_nchunks_h = thandle.get_level_nchunks(); + level_nrowsperchunk_h = thandle.get_level_nrowsperchunk(); } + iw = thandle.get_iw(); // Main loop must be performed sequential. Question: Try out Cuda's graph // stuff to reduce kernel launch overhead - printf("work array iw %d x %d, type %s\n",iw.extent(0),iw.extent(1),typeid(WorkViewType).name()); + printf("work array iw (alloc at symbolic) %d x %d, type %s, nlevels %d\n",iw.extent(0),iw.extent(1),typeid(WorkViewType).name(), nlevels); int tmpcnt = 0; int tmpnrows = 0; for (size_type lvl = 0; lvl < nlevels; ++lvl) { @@ -846,6 +836,8 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, } // end if } // end for lvl printf("Total kernel calls %d, total nrows %d\n",tmpcnt, tmpnrows); + gettimeofday( &end, NULL ); + printf(" VINH TEST: numeric -- main %.8lf (sec.)\n", 1.0 * ( end.tv_sec - begin.tv_sec ) + 1.0e-6 * ( end.tv_usec - begin.tv_usec )); } // Output check diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 79298d14ed..0d17a3436e 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -123,15 +123,15 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, // SEQLVLSCHD_TP1 algorithm (chunks) template -void level_sched(IlukHandle& thandle, const RowMapType row_map, - const EntriesType entries, LevelType1& level_list, - LevelType2& level_ptr, LevelType2& level_idx, - LevelType3& level_nchunks, LevelType3& level_nrowsperchunk, - size_type& nlevels) { + class LevelType1, class LevelType2, class size_type> +void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, + const EntriesType entries, LevelType1& level_list, + LevelType2& level_ptr, LevelType2& level_idx, + size_type& nlevels) { // Scheduling currently compute on host using nnz_lno_t = typename IlukHandle::nnz_lno_t; + using nnz_lno_view_host_t = typename IlukHandle::nnz_lno_view_host_t; size_type nrows = thandle.get_nrows(); @@ -170,11 +170,10 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, level_ptr(0) = 0; // Find max rows, number of chunks, max rows of chunks across levels - using HostViewType = - Kokkos::View; - - HostViewType lnchunks("lnchunks", nlevels); - HostViewType lnrowsperchunk("lnrowsperchunk", nlevels); + thandle.alloc_level_nchunks(nlevels); + thandle.alloc_level_nrowsperchunk(nlevels); + nnz_lno_view_host_t lnchunks = thandle.get_level_nchunks(); + nnz_lno_view_host_t lnrowsperchunk= thandle.get_level_nrowsperchunk(); #ifdef KOKKOS_ENABLE_CUDA using memory_space = typename IlukHandle::memory_space; @@ -222,8 +221,7 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, thandle.set_level_maxrows(maxrows); thandle.set_level_maxrowsperchunk(maxrowsperchunk); - level_nchunks = lnchunks; printf("nlevels %d, maxrows %d, maxrowsperchunk %d\n", nlevels, maxrows, maxrowsperchunk); - level_nrowsperchunk = lnrowsperchunk; + printf("nlevels %d, maxrows %d, maxrowsperchunk %d\n", nlevels, maxrows, maxrowsperchunk); } template Date: Tue, 19 Jul 2022 22:25:04 +0000 Subject: [PATCH 239/261] BLAS: fixing test that access results before synching For some kernels that return results in host views the implementation is non-blocking and requires to do a Kokkos::fence() before checking the value against the reference value. --- unit_test/blas/Test_Blas1_dot.hpp | 4 ++++ unit_test/blas/Test_Blas1_iamax.hpp | 3 +++ unit_test/blas/Test_Blas1_nrm1.hpp | 1 + unit_test/blas/Test_Blas1_nrm2.hpp | 2 ++ unit_test/blas/Test_Blas1_nrm2_squared.hpp | 2 ++ unit_test/blas/Test_Blas1_sum.hpp | 2 ++ 6 files changed, 14 insertions(+) diff --git a/unit_test/blas/Test_Blas1_dot.hpp b/unit_test/blas/Test_Blas1_dot.hpp index b2e3f95628..83dfd6048c 100644 --- a/unit_test/blas/Test_Blas1_dot.hpp +++ b/unit_test/blas/Test_Blas1_dot.hpp @@ -111,6 +111,7 @@ void impl_test_dot_mv(int N, int K) { Kokkos::View r("Dot::Result", K); KokkosBlas::dot(r, a, b); + Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], @@ -118,6 +119,7 @@ void impl_test_dot_mv(int N, int K) { } KokkosBlas::dot(r, c_a, c_b); + Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_const_result = r(k); EXPECT_NEAR_KK(const_const_result, expected_result[k], @@ -125,6 +127,7 @@ void impl_test_dot_mv(int N, int K) { } KokkosBlas::dot(r, a, c_b); + Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA non_const_const_result = r(k); EXPECT_NEAR_KK(non_const_const_result, expected_result[k], @@ -132,6 +135,7 @@ void impl_test_dot_mv(int N, int K) { } KokkosBlas::dot(r, c_a, b); + Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); EXPECT_NEAR_KK(const_non_const_result, expected_result[k], diff --git a/unit_test/blas/Test_Blas1_iamax.hpp b/unit_test/blas/Test_Blas1_iamax.hpp index 88c21be83c..82f1fc1c76 100644 --- a/unit_test/blas/Test_Blas1_iamax.hpp +++ b/unit_test/blas/Test_Blas1_iamax.hpp @@ -61,6 +61,7 @@ void impl_test_iamax(int N) { ViewType0D r("Iamax::Result 0-D View on host"); KokkosBlas::iamax(r, a); + Kokkos::fence(); size_type nonconst_max_loc = r(); ASSERT_EQ(nonconst_max_loc, expected_max_loc); @@ -151,6 +152,7 @@ void impl_test_iamax_mv(int N, int K) { r("Iamax::Result View on host", K); KokkosBlas::iamax(r, a); + Kokkos::fence(); for (int k = 0; k < K; k++) { size_type nonconst_result = r(k); @@ -159,6 +161,7 @@ void impl_test_iamax_mv(int N, int K) { } KokkosBlas::iamax(r, c_a); + Kokkos::fence(); for (int k = 0; k < K; k++) { size_type const_result = r(k); diff --git a/unit_test/blas/Test_Blas1_nrm1.hpp b/unit_test/blas/Test_Blas1_nrm1.hpp index c68492b6dd..1c476cbf43 100644 --- a/unit_test/blas/Test_Blas1_nrm1.hpp +++ b/unit_test/blas/Test_Blas1_nrm1.hpp @@ -98,6 +98,7 @@ void impl_test_nrm1_mv(int N, int K) { KokkosBlas::nrm1(r, a); KokkosBlas::nrm1(c_r, a); + Kokkos::fence(); for (int k = 0; k < K; k++) { EXPECT_NEAR_KK(r(k), expected_result(k), eps * expected_result(k)); EXPECT_NEAR_KK(c_r(k), expected_result(k), eps * expected_result(k)); diff --git a/unit_test/blas/Test_Blas1_nrm2.hpp b/unit_test/blas/Test_Blas1_nrm2.hpp index 688035f842..c568b12564 100644 --- a/unit_test/blas/Test_Blas1_nrm2.hpp +++ b/unit_test/blas/Test_Blas1_nrm2.hpp @@ -84,6 +84,7 @@ void impl_test_nrm2_mv(int N, int K) { Kokkos::View r("Dot::Result", K); KokkosBlas::nrm2(r, a); + Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); EXPECT_NEAR_KK(nonconst_result, expected_result[k], @@ -91,6 +92,7 @@ void impl_test_nrm2_mv(int N, int K) { } KokkosBlas::nrm2(r, c_a); + Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type const_result = r(k); EXPECT_NEAR_KK(const_result, expected_result[k], eps * expected_result[k]); diff --git a/unit_test/blas/Test_Blas1_nrm2_squared.hpp b/unit_test/blas/Test_Blas1_nrm2_squared.hpp index 317b9b543b..98c2cf7e8f 100644 --- a/unit_test/blas/Test_Blas1_nrm2_squared.hpp +++ b/unit_test/blas/Test_Blas1_nrm2_squared.hpp @@ -93,6 +93,7 @@ void impl_test_nrm2_squared_mv(int N, int K) { Kokkos::View r("Dot::Result", K); KokkosBlas::nrm2_squared(r, a); + Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); typename AT::mag_type divisor = @@ -103,6 +104,7 @@ void impl_test_nrm2_squared_mv(int N, int K) { } KokkosBlas::nrm2_squared(r, c_a); + Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type const_result = r(k); typename AT::mag_type divisor = diff --git a/unit_test/blas/Test_Blas1_sum.hpp b/unit_test/blas/Test_Blas1_sum.hpp index 2b7f51370e..5ad2ef038b 100644 --- a/unit_test/blas/Test_Blas1_sum.hpp +++ b/unit_test/blas/Test_Blas1_sum.hpp @@ -73,6 +73,7 @@ void impl_test_sum_mv(int N, int K) { Kokkos::View r("Sum::Result", K); KokkosBlas::sum(r, a); + Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA nonconst_result = r(k); EXPECT_NEAR_KK(nonconst_result, expected_result[k], @@ -80,6 +81,7 @@ void impl_test_sum_mv(int N, int K) { } KokkosBlas::sum(r, c_a); + Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_result = r(k); EXPECT_NEAR_KK(const_result, expected_result[k], eps * expected_result[k]); From 48b6a72fe7b98fcd799fa46042eb51190d68708a Mon Sep 17 00:00:00 2001 From: Luc Date: Tue, 19 Jul 2022 23:35:28 +0000 Subject: [PATCH 240/261] sycl: re-enabling test now that dpcpp has made progress With the latest dpcpp compiler drop we can turn back on a couple of test. The ArithTraits is also mostly using Kokkos implementation of math functions and numeric traits so we should be able to blame them for new issues :p Signed-off-by: Luc Berger-Vergiat --- unit_test/common/Test_Common.hpp | 3 --- unit_test/graph/Test_Graph_graph_color.hpp | 3 --- 2 files changed, 6 deletions(-) diff --git a/unit_test/common/Test_Common.hpp b/unit_test/common/Test_Common.hpp index cc4204d076..20b875f4a5 100644 --- a/unit_test/common/Test_Common.hpp +++ b/unit_test/common/Test_Common.hpp @@ -1,10 +1,7 @@ #ifndef TEST_COMMON_HPP #define TEST_COMMON_HPP -// FIXME_SYCL still some uses of the wrong namespace -#ifndef KOKKOS_ENABLE_SYCL #include -#endif // #include #include #include diff --git a/unit_test/graph/Test_Graph_graph_color.hpp b/unit_test/graph/Test_Graph_graph_color.hpp index 67b319b0c3..4d35874657 100644 --- a/unit_test/graph/Test_Graph_graph_color.hpp +++ b/unit_test/graph/Test_Graph_graph_color.hpp @@ -231,15 +231,12 @@ EXECUTE_TEST(default_scalar, int, int, TestExecSpace) EXECUTE_TEST(default_scalar, int64_t, int, TestExecSpace) #endif -// FIXME_SYCL -#ifndef KOKKOS_ENABLE_SYCL #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) EXECUTE_TEST(default_scalar, int, size_t, TestExecSpace) #endif -#endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ From 8b747ea409bcc0dc42c1d100b6ba81a30ab9c23c Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Thu, 21 Jul 2022 00:25:22 -0700 Subject: [PATCH 241/261] Apply clang format --- src/sparse/KokkosSparse_spiluk_handle.hpp | 28 +-- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 173 ++++++++++-------- .../KokkosSparse_spiluk_symbolic_impl.hpp | 66 ++++--- 3 files changed, 150 insertions(+), 117 deletions(-) diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp index 227902a1af..1ec2d3533c 100644 --- a/src/sparse/KokkosSparse_spiluk_handle.hpp +++ b/src/sparse/KokkosSparse_spiluk_handle.hpp @@ -103,7 +103,9 @@ class SPILUKHandle { typename nnz_row_view_t::memory_traits> signed_nnz_lno_view_t; - typedef Kokkos::View work_view_t; + typedef Kokkos::View + work_view_t; private: nnz_row_view_t level_list; // level IDs which the rows belong to @@ -117,9 +119,9 @@ class SPILUKHandle { level_maxnnzperrow; // maximum number of nnz per row at each level nnz_row_view_host_t level_shmem_hash_size; // hash size in the shared memory // hash map at each level - nnz_row_view_host_t level_shmem_key_size; // key size in the shared memory - // hash map at each level - work_view_t iw;//working view for mapping dense indices to sparse indices + nnz_row_view_host_t level_shmem_key_size; // key size in the shared memory + // hash map at each level + work_view_t iw; // working view for mapping dense indices to sparse indices size_type nrows; size_type nlevels; @@ -168,15 +170,15 @@ class SPILUKHandle { set_nnzU(nnzU_); set_level_maxrows(0); set_level_maxrowsperchunk(0); - level_list = nnz_row_view_t("level_list", nrows_), - level_idx = nnz_lno_view_t("level_idx", nrows_), - level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), - level_nchunks = nnz_lno_view_host_t(), + level_list = nnz_row_view_t("level_list", nrows_), + level_idx = nnz_lno_view_t("level_idx", nrows_), + level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), + level_nchunks = nnz_lno_view_host_t(), level_nrowsperchunk = nnz_lno_view_host_t(), level_maxnnzperrow = nnz_row_view_host_t(), level_shmem_hash_size = nnz_row_view_host_t(), level_shmem_key_size = nnz_row_view_host_t(), reset_symbolic_complete(), - iw = work_view_t(); + iw = work_view_t(); } virtual ~SPILUKHandle(){}; @@ -202,7 +204,9 @@ class SPILUKHandle { } KOKKOS_INLINE_FUNCTION - nnz_lno_view_host_t get_level_nrowsperchunk() const { return level_nrowsperchunk; } + nnz_lno_view_host_t get_level_nrowsperchunk() const { + return level_nrowsperchunk; + } void alloc_level_nrowsperchunk(const size_type nlevels_) { level_nrowsperchunk = nnz_lno_view_host_t("level_nrowsperchunk", nlevels_); @@ -238,9 +242,7 @@ class SPILUKHandle { } KOKKOS_INLINE_FUNCTION - work_view_t get_iw() const { - return iw; - } + work_view_t get_iw() const { return iw; } void alloc_iw(const size_type nrows_, const size_type ncols_) { iw = work_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index baa8c318de..14613adef1 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -210,18 +210,18 @@ struct ILUKLvlSchedTP1NumericFunctor { using lno_t = typename AEntriesType::non_const_value_type; using scalar_t = typename AValuesType::non_const_value_type; - ARowMapType A_row_map; + ARowMapType A_row_map; AEntriesType A_entries; - AValuesType A_values; - LRowMapType L_row_map; + AValuesType A_values; + LRowMapType L_row_map; LEntriesType L_entries; - LValuesType L_values; - URowMapType U_row_map; + LValuesType L_values; + URowMapType U_row_map; UEntriesType U_entries; - UValuesType U_values; + UValuesType U_values; LevelViewType level_idx; WorkViewType iw; - nnz_lno_t lev_start; + nnz_lno_t lev_start; ILUKLvlSchedTP1NumericFunctor( const ARowMapType &A_row_map_, const AEntriesType &A_entries_, @@ -245,8 +245,9 @@ struct ILUKLvlSchedTP1NumericFunctor { KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { - nnz_lno_t my_team = static_cast(team.league_rank()); - nnz_lno_t rowid = static_cast(level_idx(my_team + lev_start));// map to rowid + nnz_lno_t my_team = static_cast(team.league_rank()); + nnz_lno_t rowid = + static_cast(level_idx(my_team + lev_start)); // map to rowid nnz_lno_t my_thread = static_cast(team.team_rank()); nnz_lno_t ts = static_cast(team.team_size()); @@ -254,11 +255,12 @@ struct ILUKLvlSchedTP1NumericFunctor { nnz_lno_t k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG #ifndef NUMERIC_USE_FOR - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - iw(my_team, col) = k; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + iw(my_team, col) = k; + }); #else for (nnz_lno_t k = k1 + my_thread; k < k2 - 1; k += ts) { nnz_lno_t col = static_cast(L_entries(k)); @@ -268,11 +270,12 @@ struct ILUKLvlSchedTP1NumericFunctor { #endif #else #ifndef NUMERIC_USE_FOR - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - iw(my_team, col) = k; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + iw(my_team, col) = k; + }); #else for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { nnz_lno_t col = static_cast(L_entries(k)); @@ -283,7 +286,7 @@ struct ILUKLvlSchedTP1NumericFunctor { #endif #ifdef KEEP_DIAG - //if (my_thread == 0) L_values(k2 - 1) = scalar_t(1.0); + // if (my_thread == 0) L_values(k2 - 1) = scalar_t(1.0); Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k2 - 1) = scalar_t(1.0); }); #endif @@ -293,11 +296,12 @@ struct ILUKLvlSchedTP1NumericFunctor { k1 = static_cast(U_row_map(rowid)); k2 = static_cast(U_row_map(rowid + 1)); #ifndef NUMERIC_USE_FOR - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(U_entries(k)); - U_values(k) = 0.0; - iw(my_team, col) = k; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(U_entries(k)); + U_values(k) = 0.0; + iw(my_team, col) = k; + }); #else for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { nnz_lno_t col = static_cast(U_entries(k)); @@ -312,14 +316,15 @@ struct ILUKLvlSchedTP1NumericFunctor { k1 = static_cast(A_row_map(rowid)); k2 = static_cast(A_row_map(rowid + 1)); #ifndef NUMERIC_USE_FOR - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(A_entries(k)); - nnz_lno_t ipos = iw(my_team, col); - if (col < rowid) - L_values(ipos) = A_values(k); - else - U_values(ipos) = A_values(k); - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(A_entries(k)); + nnz_lno_t ipos = iw(my_team, col); + if (col < rowid) + L_values(ipos) = A_values(k); + else + U_values(ipos) = A_values(k); + }); #else for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { nnz_lno_t col = static_cast(A_entries(k)); @@ -348,27 +353,31 @@ struct ILUKLvlSchedTP1NumericFunctor { #else scalar_t fact = L_values(k) * U_values(U_row_map(prev_row)); #endif - //if (my_thread == 0) L_values(k) = fact; + // if (my_thread == 0) L_values(k) = fact; Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; }); team.team_barrier(); #ifndef NUMERIC_USE_FOR - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, U_row_map(prev_row + 1)), [&](const size_type kk) { - nnz_lno_t col = static_cast(U_entries(kk)); - nnz_lno_t ipos = iw(my_team, col); - auto lxu = -U_values(kk) * fact; - if (ipos != -1) { - if (col < rowid) - Kokkos::atomic_add(&L_values(ipos), lxu); - else - Kokkos::atomic_add(&U_values(ipos), lxu); - } - }); // end for kk + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, + U_row_map(prev_row + 1)), + [&](const size_type kk) { + nnz_lno_t col = static_cast(U_entries(kk)); + nnz_lno_t ipos = iw(my_team, col); + auto lxu = -U_values(kk) * fact; + if (ipos != -1) { + if (col < rowid) + Kokkos::atomic_add(&L_values(ipos), lxu); + else + Kokkos::atomic_add(&U_values(ipos), lxu); + } + }); // end for kk #else - for (nnz_lno_t kk = U_row_map(prev_row) + 1 + my_thread; kk < U_row_map(prev_row + 1); kk += ts) { + for (nnz_lno_t kk = U_row_map(prev_row) + 1 + my_thread; + kk < U_row_map(prev_row + 1); kk += ts) { nnz_lno_t col = static_cast(U_entries(kk)); nnz_lno_t ipos = iw(my_team, col); - auto lxu = -U_values(kk) * fact; + auto lxu = -U_values(kk) * fact; if (ipos != -1) { if (col < rowid) Kokkos::atomic_add(&L_values(ipos), lxu); @@ -380,7 +389,7 @@ struct ILUKLvlSchedTP1NumericFunctor { team.team_barrier(); } // end for k - //if (my_thread == 0) { + // if (my_thread == 0) { Kokkos::single(Kokkos::PerTeam(team), [&]() { nnz_lno_t ipos = iw(my_team, rowid); #ifdef KEEP_DIAG @@ -404,26 +413,28 @@ struct ILUKLvlSchedTP1NumericFunctor { k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG #ifndef NUMERIC_USE_FOR - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(L_entries(k)); - iw(my_team, col) = -1; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_team, col) = -1; + }); #else for (nnz_lno_t k = k1 + my_thread; k < k2 - 1; k += ts) { - nnz_lno_t col = static_cast(L_entries(k)); + nnz_lno_t col = static_cast(L_entries(k)); iw(my_team, col) = -1; } #endif #else #ifndef NUMERIC_USE_FOR - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(L_entries(k)); - iw(my_team, col) = -1; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_team, col) = -1; + }); #else for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { - nnz_lno_t col = static_cast(L_entries(k)); - iw(my_team, col) = -1; + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_team, col) = -1; } #endif #endif @@ -431,13 +442,14 @@ struct ILUKLvlSchedTP1NumericFunctor { k1 = static_cast(U_row_map(rowid)); k2 = static_cast(U_row_map(rowid + 1)); #ifndef NUMERIC_USE_FOR - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(U_entries(k)); - iw(my_team, col) = -1; - }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const nnz_lno_t k) { + nnz_lno_t col = static_cast(U_entries(k)); + iw(my_team, col) = -1; + }); #else for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { - nnz_lno_t col = static_cast(U_entries(k)); + nnz_lno_t col = static_cast(U_entries(k)); iw(my_team, col) = -1; } #endif @@ -693,16 +705,16 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; using WorkViewType = typename IlukHandle::work_view_t; using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; - - struct timeval begin, end;//VINH TEST - gettimeofday( &begin, NULL ); - + + struct timeval begin, end; // VINH TEST + gettimeofday(&begin, NULL); + size_type nlevels = thandle.get_num_levels(); size_type nrows = thandle.get_nrows(); // Keep these as host View, create device version and copy back to host - HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); - HandleDeviceEntriesType level_idx = thandle.get_level_idx(); + HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); + HandleDeviceEntriesType level_idx = thandle.get_level_idx(); // Make level_ptr_h a separate allocation, since it will be accessed on host // between kernel launches. If a mirror were used and level_ptr is in UVM @@ -716,8 +728,10 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, level_ptr.extent(0)); Kokkos::deep_copy(level_ptr_h, level_ptr); - gettimeofday( &end, NULL ); - printf(" VINH TEST: numeric -- copy level_ptr %.8lf (sec.)\n", 1.0 * ( end.tv_sec - begin.tv_sec ) + 1.0e-6 * ( end.tv_usec - begin.tv_usec )); + gettimeofday(&end, NULL); + printf(" VINH TEST: numeric -- copy level_ptr %.8lf (sec.)\n", + 1.0 * (end.tv_sec - begin.tv_sec) + + 1.0e-6 * (end.tv_usec - begin.tv_usec)); if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) { @@ -765,7 +779,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, } // end for lvl } // End SEQLVLSCHD_TP1HASHMAP else { - gettimeofday( &begin, NULL ); + gettimeofday(&begin, NULL); if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { level_nchunks_h = thandle.get_level_nchunks(); @@ -775,8 +789,9 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, // Main loop must be performed sequential. Question: Try out Cuda's graph // stuff to reduce kernel launch overhead - printf("work array iw (alloc at symbolic) %d x %d, type %s, nlevels %d\n",iw.extent(0),iw.extent(1),typeid(WorkViewType).name(), nlevels); - int tmpcnt = 0; + printf("work array iw (alloc at symbolic) %d x %d, type %s, nlevels %d\n", + iw.extent(0), iw.extent(1), typeid(WorkViewType).name(), nlevels); + int tmpcnt = 0; int tmpnrows = 0; for (size_type lvl = 0; lvl < nlevels; ++lvl) { nnz_lno_t lev_start = level_ptr_h(lvl); @@ -835,9 +850,11 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, } } // end if } // end for lvl - printf("Total kernel calls %d, total nrows %d\n",tmpcnt, tmpnrows); - gettimeofday( &end, NULL ); - printf(" VINH TEST: numeric -- main %.8lf (sec.)\n", 1.0 * ( end.tv_sec - begin.tv_sec ) + 1.0e-6 * ( end.tv_usec - begin.tv_usec )); + printf("Total kernel calls %d, total nrows %d\n", tmpcnt, tmpnrows); + gettimeofday(&end, NULL); + printf(" VINH TEST: numeric -- main %.8lf (sec.)\n", + 1.0 * (end.tv_sec - begin.tv_sec) + + 1.0e-6 * (end.tv_usec - begin.tv_usec)); } // Output check diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 0d17a3436e..c0b7a3baff 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -130,7 +130,7 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, size_type& nlevels) { // Scheduling currently compute on host - using nnz_lno_t = typename IlukHandle::nnz_lno_t; + using nnz_lno_t = typename IlukHandle::nnz_lno_t; using nnz_lno_view_host_t = typename IlukHandle::nnz_lno_view_host_t; size_type nrows = thandle.get_nrows(); @@ -172,8 +172,8 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, // Find max rows, number of chunks, max rows of chunks across levels thandle.alloc_level_nchunks(nlevels); thandle.alloc_level_nrowsperchunk(nlevels); - nnz_lno_view_host_t lnchunks = thandle.get_level_nchunks(); - nnz_lno_view_host_t lnrowsperchunk= thandle.get_level_nrowsperchunk(); + nnz_lno_view_host_t lnchunks = thandle.get_level_nchunks(); + nnz_lno_view_host_t lnrowsperchunk = thandle.get_level_nrowsperchunk(); #ifdef KOKKOS_ENABLE_CUDA using memory_space = typename IlukHandle::memory_space; @@ -201,11 +201,19 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, lnrowsperchunk(i) = (lnrows % lnchunks(i) == 0) ? (lnrows / lnchunks(i)) : (lnrows / lnchunks(i) + 1); - if ((i < 10) || (i >= nlevels-10)) - printf("Level %d, lnrows %d, nrows %d, required size %ld, avail_byte %ld, nchunks %d, rows per chunk %d\n",i,lnrows,nrows,required_size,avail_byte,lnchunks(i),lnrowsperchunk(i)); - //if (lnrows == 312) + if ((i < 10) || (i >= nlevels - 10)) + printf( + "Level %d, lnrows %d, nrows %d, required size %ld, avail_byte %ld, " + "nchunks %d, rows per chunk %d\n", + i, lnrows, nrows, required_size, avail_byte, lnchunks(i), + lnrowsperchunk(i)); + // if (lnrows == 312) if (lnrows > 250) - printf("Level %d, lnrows %d, nrows %d, required size %ld, avail_byte %ld, nchunks %d, rows per chunk %d\n",i,lnrows,nrows,required_size,avail_byte,lnchunks(i),lnrowsperchunk(i)); + printf( + "Level %d, lnrows %d, nrows %d, required size %ld, avail_byte %ld, " + "nchunks %d, rows per chunk %d\n", + i, lnrows, nrows, required_size, avail_byte, lnchunks(i), + lnrowsperchunk(i)); } else #endif { @@ -221,7 +229,8 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, thandle.set_level_maxrows(maxrows); thandle.set_level_maxrowsperchunk(maxrowsperchunk); - printf("nlevels %d, maxrows %d, maxrowsperchunk %d\n", nlevels, maxrows, maxrowsperchunk); + printf("nlevels %d, maxrows %d, maxrowsperchunk %d\n", nlevels, maxrows, + maxrowsperchunk); } template ; - struct timeval begin, end;//VINH TEST - gettimeofday( &begin, NULL ); + struct timeval begin, end; // VINH TEST + gettimeofday(&begin, NULL); HostTmpViewType h_lev("h_lev", thandle.get_nnzU()); HostTmpViewType h_iw("h_iw", nrows); HostTmpViewType h_iL("h_iL", nrows); HostTmpViewType h_llev("h_llev", nrows); - HostTmpViewType level_nchunks, level_nrowsperchunk; size_type cntL = 0; size_type cntU = 0; @@ -588,11 +596,13 @@ void iluk_symbolic(IlukHandle& thandle, thandle.set_nnzL(cntL); thandle.set_nnzU(cntU); - gettimeofday( &end, NULL ); - printf(" VINH TEST: symbolic -- main %.8lf (sec.)\n", 1.0 * ( end.tv_sec - begin.tv_sec ) + 1.0e-6 * ( end.tv_usec - begin.tv_usec )); + gettimeofday(&end, NULL); + printf(" VINH TEST: symbolic -- main %.8lf (sec.)\n", + 1.0 * (end.tv_sec - begin.tv_sec) + + 1.0e-6 * (end.tv_usec - begin.tv_usec)); // Sort - gettimeofday( &begin, NULL ); + gettimeofday(&begin, NULL); for (size_type row_id = 0; row_id < static_cast(L_row_map.extent(0)) - 1; row_id++) { size_type row_start = L_row_map(row_id); @@ -605,29 +615,31 @@ void iluk_symbolic(IlukHandle& thandle, size_type row_end = U_row_map(row_id + 1); Kokkos::sort(subview(U_entries, Kokkos::make_pair(row_start, row_end))); } - gettimeofday( &end, NULL ); - printf(" VINH TEST: symbolic -- sort %.8lf (sec.)\n", 1.0 * ( end.tv_sec - begin.tv_sec ) + 1.0e-6 * ( end.tv_usec - begin.tv_usec )); + gettimeofday(&end, NULL); + printf(" VINH TEST: symbolic -- sort %.8lf (sec.)\n", + 1.0 * (end.tv_sec - begin.tv_sec) + + 1.0e-6 * (end.tv_usec - begin.tv_usec)); // Level scheduling on L - gettimeofday( &begin, NULL ); + gettimeofday(&begin, NULL); if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) { level_sched_hashmap(thandle, L_row_map, L_entries, U_row_map, U_entries, level_list, level_ptr, level_idx, nlev); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - printf ("LEVEL SCHED on L\n"); + printf("LEVEL SCHED on L\n"); level_sched_tp(thandle, L_row_map, L_entries, level_list, level_ptr, - level_idx, nlev);//ORIG - //Level scheduling on A??? - //printf ("LEVEL SCHED on A\n"); - //level_sched (thandle, A_row_map, A_entries, level_list, level_ptr, + level_idx, nlev); // ORIG + // Level scheduling on A??? + // printf ("LEVEL SCHED on A\n"); + // level_sched (thandle, A_row_map, A_entries, level_list, level_ptr, // level_idx, nlev); - thandle.alloc_iw(thandle.get_level_maxrowsperchunk(),nrows); + thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows); } else { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, level_idx, nlev); - thandle.alloc_iw(thandle.get_level_maxrows(),nrows); + thandle.alloc_iw(thandle.get_level_maxrows(), nrows); } Kokkos::deep_copy(dlevel_ptr, level_ptr); @@ -640,8 +652,10 @@ void iluk_symbolic(IlukHandle& thandle, Kokkos::deep_copy(U_entries_d, U_entries); thandle.set_symbolic_complete(); - gettimeofday( &end, NULL ); - printf(" VINH TEST: symbolic -- sched + copy %.8lf (sec.)\n", 1.0 * ( end.tv_sec - begin.tv_sec ) + 1.0e-6 * ( end.tv_usec - begin.tv_usec )); + gettimeofday(&end, NULL); + printf(" VINH TEST: symbolic -- sched + copy %.8lf (sec.)\n", + 1.0 * (end.tv_sec - begin.tv_sec) + + 1.0e-6 * (end.tv_usec - begin.tv_usec)); // Output check #ifdef SYMBOLIC_OUTPUT_INFO From 7d14979cf540d27c08e1ea1dc6506f0a6a99ff70 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Thu, 21 Jul 2022 01:23:04 -0700 Subject: [PATCH 242/261] Remove printf --- .../KokkosKernels_HashmapAccumulator.hpp | 16 - src/sparse/KokkosSparse_spiluk_handle.hpp | 92 +-- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 566 ++++++++---------- .../KokkosSparse_spiluk_symbolic_impl.hpp | 57 +- 4 files changed, 291 insertions(+), 440 deletions(-) diff --git a/src/common/KokkosKernels_HashmapAccumulator.hpp b/src/common/KokkosKernels_HashmapAccumulator.hpp index 11cc2b1cf4..c6397fd9ea 100644 --- a/src/common/KokkosKernels_HashmapAccumulator.hpp +++ b/src/common/KokkosKernels_HashmapAccumulator.hpp @@ -779,22 +779,6 @@ struct HashmapAccumulator { return __insert_success; } } - - // function to be called from device. - KOKKOS_INLINE_FUNCTION - size_type find(const key_type &key) { - size_type hash, i; - - if (key == -1) return -1; - - hash = __compute_hash(key, __hashOpRHS); - for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { - if (keys[i] == key) { - return i; - } - } - return -1; - } // end public members private: size_type __max_value_size; diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp index 1ec2d3533c..e449b97057 100644 --- a/src/sparse/KokkosSparse_spiluk_handle.hpp +++ b/src/sparse/KokkosSparse_spiluk_handle.hpp @@ -59,8 +59,8 @@ namespace Experimental { // TP2 algorithm has issues with some offset-ordinal combo to be addressed enum class SPILUKAlgorithm { SEQLVLSCHD_RP, - SEQLVLSCHD_TP1, /*, SEQLVLSCHED_TP2*/ - SEQLVLSCHD_TP1HASHMAP + SEQLVLSCHD_TP1 /*, SEQLVLSCHED_TP2*/ + //SEQLVLSCHD_TP1HASHMAP }; template #include -#include - //#define NUMERIC_OUTPUT_INFO -//#define NUMERIC_USE_FOR namespace KokkosSparse { namespace Impl { @@ -254,7 +251,6 @@ struct ILUKLvlSchedTP1NumericFunctor { nnz_lno_t k1 = static_cast(L_row_map(rowid)); nnz_lno_t k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG -#ifndef NUMERIC_USE_FOR Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), [&](const nnz_lno_t k) { nnz_lno_t col = static_cast(L_entries(k)); @@ -262,27 +258,12 @@ struct ILUKLvlSchedTP1NumericFunctor { iw(my_team, col) = k; }); #else - for (nnz_lno_t k = k1 + my_thread; k < k2 - 1; k += ts) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - iw(my_team, col) = k; - } -#endif -#else -#ifndef NUMERIC_USE_FOR Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { nnz_lno_t col = static_cast(L_entries(k)); L_values(k) = 0.0; iw(my_team, col) = k; }); -#else - for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - iw(my_team, col) = k; - } -#endif #endif #ifdef KEEP_DIAG @@ -295,27 +276,18 @@ struct ILUKLvlSchedTP1NumericFunctor { k1 = static_cast(U_row_map(rowid)); k2 = static_cast(U_row_map(rowid + 1)); -#ifndef NUMERIC_USE_FOR Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { nnz_lno_t col = static_cast(U_entries(k)); U_values(k) = 0.0; iw(my_team, col) = k; }); -#else - for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { - nnz_lno_t col = static_cast(U_entries(k)); - U_values(k) = 0.0; - iw(my_team, col) = k; - } -#endif team.team_barrier(); // Unpack the ith row of A k1 = static_cast(A_row_map(rowid)); k2 = static_cast(A_row_map(rowid + 1)); -#ifndef NUMERIC_USE_FOR Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { nnz_lno_t col = static_cast(A_entries(k)); @@ -325,16 +297,6 @@ struct ILUKLvlSchedTP1NumericFunctor { else U_values(ipos) = A_values(k); }); -#else - for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { - nnz_lno_t col = static_cast(A_entries(k)); - nnz_lno_t ipos = iw(my_team, col); - if (col < rowid) - L_values(ipos) = A_values(k); - else - U_values(ipos) = A_values(k); - } -#endif team.team_barrier(); @@ -357,7 +319,7 @@ struct ILUKLvlSchedTP1NumericFunctor { Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; }); team.team_barrier(); -#ifndef NUMERIC_USE_FOR + Kokkos::parallel_for( Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, U_row_map(prev_row + 1)), @@ -372,20 +334,7 @@ struct ILUKLvlSchedTP1NumericFunctor { Kokkos::atomic_add(&U_values(ipos), lxu); } }); // end for kk -#else - for (nnz_lno_t kk = U_row_map(prev_row) + 1 + my_thread; - kk < U_row_map(prev_row + 1); kk += ts) { - nnz_lno_t col = static_cast(U_entries(kk)); - nnz_lno_t ipos = iw(my_team, col); - auto lxu = -U_values(kk) * fact; - if (ipos != -1) { - if (col < rowid) - Kokkos::atomic_add(&L_values(ipos), lxu); - else - Kokkos::atomic_add(&U_values(ipos), lxu); - } - } // end for kk -#endif + team.team_barrier(); } // end for k @@ -412,282 +361,261 @@ struct ILUKLvlSchedTP1NumericFunctor { k1 = static_cast(L_row_map(rowid)); k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG -#ifndef NUMERIC_USE_FOR Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), [&](const nnz_lno_t k) { nnz_lno_t col = static_cast(L_entries(k)); iw(my_team, col) = -1; }); #else - for (nnz_lno_t k = k1 + my_thread; k < k2 - 1; k += ts) { - nnz_lno_t col = static_cast(L_entries(k)); - iw(my_team, col) = -1; - } -#endif -#else -#ifndef NUMERIC_USE_FOR Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { nnz_lno_t col = static_cast(L_entries(k)); iw(my_team, col) = -1; }); -#else - for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { - nnz_lno_t col = static_cast(L_entries(k)); - iw(my_team, col) = -1; - } -#endif #endif k1 = static_cast(U_row_map(rowid)); k2 = static_cast(U_row_map(rowid + 1)); -#ifndef NUMERIC_USE_FOR Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { nnz_lno_t col = static_cast(U_entries(k)); iw(my_team, col) = -1; }); -#else - for (nnz_lno_t k = k1 + my_thread; k < k2; k += ts) { - nnz_lno_t col = static_cast(U_entries(k)); - iw(my_team, col) = -1; - } -#endif } }; -template -struct ILUKLvlSchedTP1HashMapNumericFunctor { - using execution_space = typename ARowMapType::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - using size_type = typename ARowMapType::non_const_value_type; - using scalar_t = typename AValuesType::non_const_value_type; - using hashmap_type = KokkosKernels::Experimental::HashmapAccumulator< - nnz_lno_t, nnz_lno_t, nnz_lno_t, - KokkosKernels::Experimental::HashOpType::bitwiseAnd>; - - ARowMapType A_row_map; - AEntriesType A_entries; - AValuesType A_values; - LRowMapType L_row_map; - LEntriesType L_entries; - LValuesType L_values; - URowMapType U_row_map; - UEntriesType U_entries; - UValuesType U_values; - LevelViewType level_idx; - nnz_lno_t lev_start; - nnz_lno_t shmem_hash_size; - nnz_lno_t shmem_key_size; - nnz_lno_t shared_memory_hash_func; - nnz_lno_t shmem_size; - - ILUKLvlSchedTP1HashMapNumericFunctor( - const ARowMapType &A_row_map_, const AEntriesType &A_entries_, - const AValuesType &A_values_, const LRowMapType &L_row_map_, - const LEntriesType &L_entries_, LValuesType &L_values_, - const URowMapType &U_row_map_, const UEntriesType &U_entries_, - UValuesType &U_values_, const LevelViewType &level_idx_, - const nnz_lno_t &lev_start_, const nnz_lno_t &shmem_hash_size_, - const nnz_lno_t &shmem_key_size_, - const nnz_lno_t &shared_memory_hash_func_, const nnz_lno_t &shmem_size_) - : A_row_map(A_row_map_), - A_entries(A_entries_), - A_values(A_values_), - L_row_map(L_row_map_), - L_entries(L_entries_), - L_values(L_values_), - U_row_map(U_row_map_), - U_entries(U_entries_), - U_values(U_values_), - level_idx(level_idx_), - lev_start(lev_start_), - shmem_hash_size(shmem_hash_size_), - shmem_key_size(shmem_key_size_), - shared_memory_hash_func(shared_memory_hash_func_), - shmem_size(shmem_size_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // teamid - auto rowid = level_idx(my_league + lev_start); // teamid-->rowid - // auto my_team = team.team_rank(); - - // START shared hash map initialization - char *all_shared_memory = (char *)(team.team_shmem().get_shmem(shmem_size)); - - // Threads in a team share 4 arrays: begin, next, keys, values - // used_hash_sizes hold the size of 1st and 2nd level hashes (not using 2nd - // level hash right now) - volatile nnz_lno_t *used_hash_sizes = - (volatile nnz_lno_t *)(all_shared_memory); - all_shared_memory += sizeof(nnz_lno_t) * 2; - - // points to begin array - nnz_lno_t *begins = (nnz_lno_t *)(all_shared_memory); - all_shared_memory += sizeof(nnz_lno_t) * shmem_hash_size; - - // points to the next elements - nnz_lno_t *nexts = (nnz_lno_t *)(all_shared_memory); - all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size; - - // holds the keys and vals - nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory); - all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size; - nnz_lno_t *vals = (nnz_lno_t *)(all_shared_memory); - - hashmap_type hm(shmem_key_size, shared_memory_hash_func, begins, nexts, - keys, vals); - - // initialize begins - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, shmem_hash_size), - [&](int i) { begins[i] = -1; }); - - // initialize hash usage sizes - Kokkos::single(Kokkos::PerTeam(team), [&]() { - used_hash_sizes[0] = 0; - used_hash_sizes[1] = 0; - }); - - team.team_barrier(); - // Shared hash map initialization DONE - - auto k1 = L_row_map(rowid); - auto k2 = L_row_map(rowid + 1); -#ifdef KEEP_DIAG - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2 - 1), [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr( - col, k, used_hash_sizes); - }); -#else - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(L_entries(k)); - L_values(k) = 0.0; - int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr( - col, k, used_hash_sizes); - }); -#endif - -#ifdef KEEP_DIAG - // if ( my_team == 0 ) L_values(k2-1) = scalar_t(1.0); - Kokkos::single(Kokkos::PerTeam(team), - [&]() { L_values(k2 - 1) = scalar_t(1.0); }); -#endif - - team.team_barrier(); - - k1 = U_row_map(rowid); - k2 = U_row_map(rowid + 1); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(U_entries(k)); - U_values(k) = 0.0; - int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr( - col, k, used_hash_sizes); - }); - - // Kokkos::single(Kokkos::PerTeam(team),[&] () { - // if (temp_nnz_cnt > shmem_key_size) - // printf("VINHVINH teamid %d, rowid %d (at level %d), temp_nnz_cnt %d, - // shmem_key_size %d\n", my_league, rowid, lvl+1, temp_nnz_cnt, - // shmem_key_size); - //}); - - team.team_barrier(); - - // Unpack the ith row of A - k1 = A_row_map(rowid); - k2 = A_row_map(rowid + 1); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const nnz_lno_t k) { - nnz_lno_t col = static_cast(A_entries(k)); - nnz_lno_t hashmap_idx = hm.find(col); - if (hashmap_idx != -1) { - nnz_lno_t ipos = hm.values[hashmap_idx]; - if (col < rowid) - L_values(ipos) = A_values(k); - else - U_values(ipos) = A_values(k); - } - }); - - team.team_barrier(); - - // Eliminate prev rows - k1 = L_row_map(rowid); - k2 = L_row_map(rowid + 1); -#ifdef KEEP_DIAG - for (auto k = k1; k < k2 - 1; ++k) -#else - for (auto k = k1; k < k2; ++k) -#endif - { - auto prev_row = L_entries(k); -#ifdef KEEP_DIAG - auto fact = L_values(k) / U_values(U_row_map(prev_row)); -#else - auto fact = L_values(k) * U_values(U_row_map(prev_row)); -#endif - // if ( my_team == 0 ) L_values(k) = fact; - Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; }); - - team.team_barrier(); - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, - U_row_map(prev_row + 1)), - [&](const size_type kk) { - nnz_lno_t col = static_cast(U_entries(kk)); - nnz_lno_t hashmap_idx = hm.find(col); - if (hashmap_idx != -1) { - nnz_lno_t ipos = hm.values[hashmap_idx]; - auto lxu = -U_values(kk) * fact; - if (col < rowid) - // L_values(ipos) += lxu; - Kokkos::atomic_add(&L_values(ipos), lxu); - else - // U_values(ipos) += lxu; - Kokkos::atomic_add(&U_values(ipos), lxu); - } - }); // end for kk - - team.team_barrier(); - } // end for k - - // if ( my_team == 0 ) { - Kokkos::single(Kokkos::PerTeam(team), [&]() { - nnz_lno_t hashmap_idx = hm.find(rowid); - if (hashmap_idx != -1) { - nnz_lno_t ipos = hm.values[hashmap_idx]; -#ifdef KEEP_DIAG - if (U_values(ipos) == 0.0) { - U_values(ipos) = 1e6; - } -#else - if (U_values(ipos) == 0.0) { - U_values(ipos) = 1e6; - } - else { - U_values(ipos) = 1.0 / U_values(ipos); - } -#endif - } - }); - //} - } - - // nnz_lno_t team_shmem_size(int /* team_size */) const { - // return shmem_size; - //} -}; +//template +//struct ILUKLvlSchedTP1HashMapNumericFunctor { +// using execution_space = typename ARowMapType::execution_space; +// using policy_type = Kokkos::TeamPolicy; +// using member_type = typename policy_type::member_type; +// using size_type = typename ARowMapType::non_const_value_type; +// using scalar_t = typename AValuesType::non_const_value_type; +// using hashmap_type = KokkosKernels::Experimental::HashmapAccumulator< +// nnz_lno_t, nnz_lno_t, nnz_lno_t, +// KokkosKernels::Experimental::HashOpType::bitwiseAnd>; +// +// ARowMapType A_row_map; +// AEntriesType A_entries; +// AValuesType A_values; +// LRowMapType L_row_map; +// LEntriesType L_entries; +// LValuesType L_values; +// URowMapType U_row_map; +// UEntriesType U_entries; +// UValuesType U_values; +// LevelViewType level_idx; +// nnz_lno_t lev_start; +// nnz_lno_t shmem_hash_size; +// nnz_lno_t shmem_key_size; +// nnz_lno_t shared_memory_hash_func; +// nnz_lno_t shmem_size; +// +// ILUKLvlSchedTP1HashMapNumericFunctor( +// const ARowMapType &A_row_map_, const AEntriesType &A_entries_, +// const AValuesType &A_values_, const LRowMapType &L_row_map_, +// const LEntriesType &L_entries_, LValuesType &L_values_, +// const URowMapType &U_row_map_, const UEntriesType &U_entries_, +// UValuesType &U_values_, const LevelViewType &level_idx_, +// const nnz_lno_t &lev_start_, const nnz_lno_t &shmem_hash_size_, +// const nnz_lno_t &shmem_key_size_, +// const nnz_lno_t &shared_memory_hash_func_, const nnz_lno_t &shmem_size_) +// : A_row_map(A_row_map_), +// A_entries(A_entries_), +// A_values(A_values_), +// L_row_map(L_row_map_), +// L_entries(L_entries_), +// L_values(L_values_), +// U_row_map(U_row_map_), +// U_entries(U_entries_), +// U_values(U_values_), +// level_idx(level_idx_), +// lev_start(lev_start_), +// shmem_hash_size(shmem_hash_size_), +// shmem_key_size(shmem_key_size_), +// shared_memory_hash_func(shared_memory_hash_func_), +// shmem_size(shmem_size_) {} +// +// KOKKOS_INLINE_FUNCTION +// void operator()(const member_type &team) const { +// auto my_league = team.league_rank(); // teamid +// auto rowid = level_idx(my_league + lev_start); // teamid-->rowid +// // auto my_team = team.team_rank(); +// +// // START shared hash map initialization +// char *all_shared_memory = (char *)(team.team_shmem().get_shmem(shmem_size)); +// +// // Threads in a team share 4 arrays: begin, next, keys, values +// // used_hash_sizes hold the size of 1st and 2nd level hashes (not using 2nd +// // level hash right now) +// volatile nnz_lno_t *used_hash_sizes = +// (volatile nnz_lno_t *)(all_shared_memory); +// all_shared_memory += sizeof(nnz_lno_t) * 2; +// +// // points to begin array +// nnz_lno_t *begins = (nnz_lno_t *)(all_shared_memory); +// all_shared_memory += sizeof(nnz_lno_t) * shmem_hash_size; +// +// // points to the next elements +// nnz_lno_t *nexts = (nnz_lno_t *)(all_shared_memory); +// all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size; +// +// // holds the keys and vals +// nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory); +// all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size; +// nnz_lno_t *vals = (nnz_lno_t *)(all_shared_memory); +// +// hashmap_type hm(shmem_key_size, shared_memory_hash_func, begins, nexts, +// keys, vals); +// +// // initialize begins +// Kokkos::parallel_for(Kokkos::TeamThreadRange(team, shmem_hash_size), +// [&](int i) { begins[i] = -1; }); +// +// // initialize hash usage sizes +// Kokkos::single(Kokkos::PerTeam(team), [&]() { +// used_hash_sizes[0] = 0; +// used_hash_sizes[1] = 0; +// }); +// +// team.team_barrier(); +// // Shared hash map initialization DONE +// +// auto k1 = L_row_map(rowid); +// auto k2 = L_row_map(rowid + 1); +//#ifdef KEEP_DIAG +// Kokkos::parallel_for( +// Kokkos::TeamThreadRange(team, k1, k2 - 1), [&](const nnz_lno_t k) { +// nnz_lno_t col = static_cast(L_entries(k)); +// L_values(k) = 0.0; +// int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr( +// col, k, used_hash_sizes); +// }); +//#else +// Kokkos::parallel_for( +// Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { +// nnz_lno_t col = static_cast(L_entries(k)); +// L_values(k) = 0.0; +// int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr( +// col, k, used_hash_sizes); +// }); +//#endif +// +//#ifdef KEEP_DIAG +// // if ( my_team == 0 ) L_values(k2-1) = scalar_t(1.0); +// Kokkos::single(Kokkos::PerTeam(team), +// [&]() { L_values(k2 - 1) = scalar_t(1.0); }); +//#endif +// +// team.team_barrier(); +// +// k1 = U_row_map(rowid); +// k2 = U_row_map(rowid + 1); +// Kokkos::parallel_for( +// Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { +// nnz_lno_t col = static_cast(U_entries(k)); +// U_values(k) = 0.0; +// int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr( +// col, k, used_hash_sizes); +// }); +// +// // Kokkos::single(Kokkos::PerTeam(team),[&] () { +// // if (temp_nnz_cnt > shmem_key_size) +// // printf("VINHVINH teamid %d, rowid %d (at level %d), temp_nnz_cnt %d, +// // shmem_key_size %d\n", my_league, rowid, lvl+1, temp_nnz_cnt, +// // shmem_key_size); +// //}); +// +// team.team_barrier(); +// +// // Unpack the ith row of A +// k1 = A_row_map(rowid); +// k2 = A_row_map(rowid + 1); +// Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), +// [&](const nnz_lno_t k) { +// nnz_lno_t col = static_cast(A_entries(k)); +// nnz_lno_t hashmap_idx = hm.find(col); +// if (hashmap_idx != -1) { +// nnz_lno_t ipos = hm.values[hashmap_idx]; +// if (col < rowid) +// L_values(ipos) = A_values(k); +// else +// U_values(ipos) = A_values(k); +// } +// }); +// +// team.team_barrier(); +// +// // Eliminate prev rows +// k1 = L_row_map(rowid); +// k2 = L_row_map(rowid + 1); +//#ifdef KEEP_DIAG +// for (auto k = k1; k < k2 - 1; ++k) +//#else +// for (auto k = k1; k < k2; ++k) +//#endif +// { +// auto prev_row = L_entries(k); +//#ifdef KEEP_DIAG +// auto fact = L_values(k) / U_values(U_row_map(prev_row)); +//#else +// auto fact = L_values(k) * U_values(U_row_map(prev_row)); +//#endif +// // if ( my_team == 0 ) L_values(k) = fact; +// Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; }); +// +// team.team_barrier(); +// +// Kokkos::parallel_for( +// Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, +// U_row_map(prev_row + 1)), +// [&](const size_type kk) { +// nnz_lno_t col = static_cast(U_entries(kk)); +// nnz_lno_t hashmap_idx = hm.find(col); +// if (hashmap_idx != -1) { +// nnz_lno_t ipos = hm.values[hashmap_idx]; +// auto lxu = -U_values(kk) * fact; +// if (col < rowid) +// // L_values(ipos) += lxu; +// Kokkos::atomic_add(&L_values(ipos), lxu); +// else +// // U_values(ipos) += lxu; +// Kokkos::atomic_add(&U_values(ipos), lxu); +// } +// }); // end for kk +// +// team.team_barrier(); +// } // end for k +// +// // if ( my_team == 0 ) { +// Kokkos::single(Kokkos::PerTeam(team), [&]() { +// nnz_lno_t hashmap_idx = hm.find(rowid); +// if (hashmap_idx != -1) { +// nnz_lno_t ipos = hm.values[hashmap_idx]; +//#ifdef KEEP_DIAG +// if (U_values(ipos) == 0.0) { +// U_values(ipos) = 1e6; +// } +//#else +// if (U_values(ipos) == 0.0) { +// U_values(ipos) = 1e6; +// } +// else { +// U_values(ipos) = 1.0 / U_values(ipos); +// } +//#endif +// } +// }); +// //} +// } +// +// // nnz_lno_t team_shmem_size(int /* team_size */) const { +// // return shmem_size; +// //} +//}; template #include -#include - //#define SYMBOLIC_OUTPUT_INFO namespace KokkosSparse { @@ -201,19 +199,6 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, lnrowsperchunk(i) = (lnrows % lnchunks(i) == 0) ? (lnrows / lnchunks(i)) : (lnrows / lnchunks(i) + 1); - if ((i < 10) || (i >= nlevels - 10)) - printf( - "Level %d, lnrows %d, nrows %d, required size %ld, avail_byte %ld, " - "nchunks %d, rows per chunk %d\n", - i, lnrows, nrows, required_size, avail_byte, lnchunks(i), - lnrowsperchunk(i)); - // if (lnrows == 312) - if (lnrows > 250) - printf( - "Level %d, lnrows %d, nrows %d, required size %ld, avail_byte %ld, " - "nchunks %d, rows per chunk %d\n", - i, lnrows, nrows, required_size, avail_byte, lnchunks(i), - lnrowsperchunk(i)); } else #endif { @@ -228,12 +213,9 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, thandle.set_num_levels(nlevels); thandle.set_level_maxrows(maxrows); thandle.set_level_maxrowsperchunk(maxrowsperchunk); - - printf("nlevels %d, maxrows %d, maxrowsperchunk %d\n", nlevels, maxrows, - maxrowsperchunk); } -template void level_sched_hashmap(IlukHandle& thandle, const LRowMapType L_row_map, @@ -354,7 +336,7 @@ void level_sched_hashmap(IlukHandle& thandle, const LRowMapType L_row_map, thandle.set_num_levels(nlevels); thandle.set_level_maxrows(maxrows); -} +}*/ // Linear Search for the smallest row index template @@ -398,10 +380,10 @@ void iluk_symbolic(IlukHandle& thandle, if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP || thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1 || - thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) /* || thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) + || thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHED_TP2 )*/ { // Scheduling and symbolic phase currently compute on host - need host copy @@ -461,9 +443,6 @@ void iluk_symbolic(IlukHandle& thandle, using HostTmpViewType = Kokkos::View; - struct timeval begin, end; // VINH TEST - gettimeofday(&begin, NULL); - HostTmpViewType h_lev("h_lev", thandle.get_nnzU()); HostTmpViewType h_iw("h_iw", nrows); HostTmpViewType h_iL("h_iL", nrows); @@ -596,13 +575,7 @@ void iluk_symbolic(IlukHandle& thandle, thandle.set_nnzL(cntL); thandle.set_nnzU(cntU); - gettimeofday(&end, NULL); - printf(" VINH TEST: symbolic -- main %.8lf (sec.)\n", - 1.0 * (end.tv_sec - begin.tv_sec) + - 1.0e-6 * (end.tv_usec - begin.tv_usec)); - // Sort - gettimeofday(&begin, NULL); for (size_type row_id = 0; row_id < static_cast(L_row_map.extent(0)) - 1; row_id++) { size_type row_start = L_row_map(row_id); @@ -615,26 +588,16 @@ void iluk_symbolic(IlukHandle& thandle, size_type row_end = U_row_map(row_id + 1); Kokkos::sort(subview(U_entries, Kokkos::make_pair(row_start, row_end))); } - gettimeofday(&end, NULL); - printf(" VINH TEST: symbolic -- sort %.8lf (sec.)\n", - 1.0 * (end.tv_sec - begin.tv_sec) + - 1.0e-6 * (end.tv_usec - begin.tv_usec)); // Level scheduling on L - gettimeofday(&begin, NULL); - if (thandle.get_algorithm() == + /*if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) { level_sched_hashmap(thandle, L_row_map, L_entries, U_row_map, U_entries, level_list, level_ptr, level_idx, nlev); - } else if (thandle.get_algorithm() == + } else*/ if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - printf("LEVEL SCHED on L\n"); level_sched_tp(thandle, L_row_map, L_entries, level_list, level_ptr, - level_idx, nlev); // ORIG - // Level scheduling on A??? - // printf ("LEVEL SCHED on A\n"); - // level_sched (thandle, A_row_map, A_entries, level_list, level_ptr, - // level_idx, nlev); + level_idx, nlev); thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows); } else { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, @@ -652,10 +615,6 @@ void iluk_symbolic(IlukHandle& thandle, Kokkos::deep_copy(U_entries_d, U_entries); thandle.set_symbolic_complete(); - gettimeofday(&end, NULL); - printf(" VINH TEST: symbolic -- sched + copy %.8lf (sec.)\n", - 1.0 * (end.tv_sec - begin.tv_sec) + - 1.0e-6 * (end.tv_usec - begin.tv_usec)); // Output check #ifdef SYMBOLIC_OUTPUT_INFO From 0bd6d922e3a38b041f61882bf86475a4f2861857 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 21 Jul 2022 16:29:09 -0600 Subject: [PATCH 243/261] Sparse: bsr transpose algorithm Adding naive implementation of the transpose of a BsrMatrix after computing the transpose of its graph. Signed-off-by: Luc Berger-Vergiat --- src/sparse/KokkosSparse_Utils.hpp | 126 +++++++++++++- unit_test/sparse/Test_Sparse_Transpose.hpp | 187 ++++++++++++++++++++- 2 files changed, 303 insertions(+), 10 deletions(-) diff --git a/src/sparse/KokkosSparse_Utils.hpp b/src/sparse/KokkosSparse_Utils.hpp index db656c959b..007b2aea85 100644 --- a/src/sparse/KokkosSparse_Utils.hpp +++ b/src/sparse/KokkosSparse_Utils.hpp @@ -293,17 +293,17 @@ struct TransposeMatrix { struct CountTag {}; struct FillTag {}; - typedef Kokkos::TeamPolicy team_count_policy_t; - typedef Kokkos::TeamPolicy team_fill_policy_t; + using team_count_policy_t = Kokkos::TeamPolicy; + using team_fill_policy_t = Kokkos::TeamPolicy; - typedef typename team_count_policy_t::member_type team_count_member_t; - typedef typename team_fill_policy_t::member_type team_fill_member_t; + using team_count_member_t = typename team_count_policy_t::member_type; + using team_fill_member_t = typename team_fill_policy_t::member_type; - typedef typename in_nnz_view_t::non_const_value_type nnz_lno_t; - typedef typename in_row_view_t::non_const_value_type size_type; + using nnz_lno_t = typename in_nnz_view_t::non_const_value_type; + using size_type = typename in_row_view_t::non_const_value_type; - typename in_nnz_view_t::non_const_value_type num_rows; - typename in_nnz_view_t::non_const_value_type num_cols; + nnz_lno_t num_rows; + nnz_lno_t num_cols; in_row_view_t xadj; in_nnz_view_t adj; in_scalar_view_t vals; @@ -539,6 +539,116 @@ void transpose_graph( MyExecSpace().fence(); } +template +struct TransposeBsrMatrix { + using ordinal_type = typename in_nnz_view_t::non_const_value_type; + using size_type = typename in_row_view_t::non_const_value_type; + + int block_size; + in_row_view_t Arow_map; + in_nnz_view_t Aentries; + in_scalar_view_t Avalues; + out_row_view_t tArow_map; // allocated + out_nnz_view_t tAentries; // allocated + out_scalar_view_t tAvalues; // allocated + + TransposeBsrMatrix(const int blockSize, in_row_view_t row_mapA, + in_nnz_view_t entriesA, in_scalar_view_t valuesA, + out_row_view_t row_mapAt, out_nnz_view_t entriesAt, + out_scalar_view_t valuesAt) + : block_size(blockSize), + Arow_map(row_mapA), + Aentries(entriesA), + Avalues(valuesA), + tArow_map(row_mapAt), + tAentries(entriesAt), + tAvalues(valuesAt){}; + + KOKKOS_INLINE_FUNCTION + void operator()(const int tArowIdx) const { + // Loop over entries in row + for (size_type tAentryIdx = tArow_map(tArowIdx); + tAentryIdx < tArow_map(tArowIdx + 1); ++tAentryIdx) { + ordinal_type tAcolIdx = tAentries(tAentryIdx); + + // we have block tA(tArowIdx, tAcolIdx) starting at tAvalues(entryIdx) + // we need to find AentryIdx corresponding to A(tAcolIdx, tArowIdx) + size_type AentryIdx; + for (AentryIdx = Arow_map(tAcolIdx); AentryIdx < Arow_map(tAcolIdx + 1); + ++AentryIdx) { + if (tArowIdx == Aentries(AentryIdx)) break; + } + + // we loop over block_size*block_size Avalues starting at AentryIdx + // and store them into tAvalues in transpose order starting at tAentryIdx + for (int i = 0; i < block_size; ++i) { + for (int j = 0; j < block_size; ++j) { + tAvalues(tAentryIdx * block_size * block_size + i * block_size + j) = + Avalues(AentryIdx * block_size * block_size + j * block_size + i); + } + } + } + } +}; // TransposeBsrMatrix + +template +void transpose_bsr_matrix( + typename in_nnz_view_t::non_const_value_type num_rows, + typename in_nnz_view_t::non_const_value_type num_cols, const int block_size, + in_row_view_t xadj, in_nnz_view_t adj, in_scalar_view_t vals, + out_row_view_t t_xadj, // pre-allocated -- initialized with 0 + out_nnz_view_t t_adj, // pre-allocated -- no need for initialize + out_scalar_view_t t_vals // pre-allocated -- no need for initialize +) { + using TransposeBsrFunctor_type = + TransposeBsrMatrix; + + // Step 1: call transpose_graph of bsr matrix + transpose_graph(num_rows, num_cols, xadj, adj, + t_xadj, t_adj); + + // Step 2: transpose the values of A + Kokkos::RangePolicy my_policy(0, num_cols); + TransposeBsrFunctor_type my_functor(block_size, xadj, adj, vals, t_xadj, + t_adj, t_vals); + + Kokkos::parallel_for(my_policy, my_functor); + MyExecSpace().fence(); +} + +template +bsrMat_t transpose_bsr_matrix(const bsrMat_t &A) { + // Allocate views and call the other version of transpose_matrix + using c_rowmap_t = typename bsrMat_t::row_map_type; + using c_entries_t = typename bsrMat_t::index_type; + using c_values_t = typename bsrMat_t::values_type; + using rowmap_t = typename bsrMat_t::row_map_type::non_const_type; + using entries_t = typename bsrMat_t::index_type::non_const_type; + using values_t = typename bsrMat_t::values_type::non_const_type; + + rowmap_t AT_rowmap("Transpose rowmap", A.numCols() + 1); + entries_t AT_entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Transpose entries"), + A.nnz()); + values_t AT_values( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Transpose values"), + A.nnz() * A.blockDim() * A.blockDim()); + transpose_bsr_matrix( + A.numRows(), A.numCols(), A.blockDim(), A.graph.row_map, A.graph.entries, + A.values, AT_rowmap, AT_entries, AT_values); + // And construct the transpose crsMat_t + return bsrMat_t("Transpose", A.numCols(), A.numRows(), A.nnz(), AT_values, + AT_rowmap, AT_entries, A.blockDim()); +} + template struct Fill_Reverse_Scale_Functor { struct CountTag {}; diff --git a/unit_test/sparse/Test_Sparse_Transpose.hpp b/unit_test/sparse/Test_Sparse_Transpose.hpp index 530614eace..77868a7251 100644 --- a/unit_test/sparse/Test_Sparse_Transpose.hpp +++ b/unit_test/sparse/Test_Sparse_Transpose.hpp @@ -152,7 +152,179 @@ void testTranspose(int numRows, int numCols, bool doValues) { } } -TEST_F(TestCategory, common_transpose_matrix) { +template +void CompareBsrMatrices(bsrMat_t& A, bsrMat_t& B) { + using exec_space = typename bsrMat_t::execution_space; + using range_pol = Kokkos::RangePolicy; + using size_type = default_size_type; + using c_rowmap_t = typename bsrMat_t::row_map_type; + using c_entries_t = typename bsrMat_t::index_type; + using values_t = typename bsrMat_t::values_type::non_const_type; + + // The views should now be exactly identical, since they represent the same + // matrix and are sorted + + size_type rowmapDiffs; + Kokkos::parallel_reduce( + range_pol(0, A.numRows() + 1), + ExactCompare(A.graph.row_map, B.graph.row_map), + rowmapDiffs); + + size_type entriesDiffs; + Kokkos::parallel_reduce( + range_pol(0, A.nnz()), + ExactCompare(A.graph.entries, B.graph.entries), + entriesDiffs); + + EXPECT_EQ(size_type(0), rowmapDiffs); + EXPECT_EQ(size_type(0), entriesDiffs); + + size_type valuesDiffs; + Kokkos::parallel_reduce(range_pol(0, A.nnz() * A.blockDim() * A.blockDim()), + ExactCompare(A.values, B.values), + valuesDiffs); + EXPECT_EQ(size_type(0), valuesDiffs); +} + +template +void testTransposeBsrRef() { + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using mem_space = typename exec_space::memory_space; + using device_t = Kokkos::Device; + using bsrMat_t = + typename KokkosSparse::Experimental::BsrMatrix; + using rowmap_t = typename bsrMat_t::row_map_type::non_const_type; + using entries_t = typename bsrMat_t::index_type::non_const_type; + using values_t = typename bsrMat_t::values_type::non_const_type; + + const int numRows = 4; + const int nnz = 7; + const int block_size = 2; + + // Coming up with a BsrMatrix + bsrMat_t A; + { + rowmap_t row_map("row map", numRows + 1); + entries_t entries("entries", nnz); + values_t values("values", nnz * block_size * block_size); + + const size_type row_mapPtr[] = {0, 2, 3, 5, 7}; + const lno_t entriesPtr[] = {2, 3, 1, 0, 1, 1, 3}; + const scalar_t valuesPtr[] = { + 0.0, 0.1, 0.2, 0.3, 1.0, 1.1, 1.2, 1.3, 2.0, 2.1, 2.2, 2.3, 3.0, 3.1, + 3.2, 3.3, 4.0, 4.1, 4.2, 4.3, 5.0, 5.1, 5.2, 5.3, 6.0, 6.1, 6.2, 6.3}; + + typename rowmap_t::HostMirror::const_type row_map_h(row_mapPtr, + numRows + 1); + typename entries_t::HostMirror::const_type entries_h(entriesPtr, nnz); + typename values_t::HostMirror::const_type values_h( + valuesPtr, nnz * block_size * block_size); + + Kokkos::deep_copy(row_map, row_map_h); + Kokkos::deep_copy(entries, entries_h); + Kokkos::deep_copy(values, values_h); + + A = bsrMat_t("A", numRows, numRows, nnz, values, row_map, entries, + block_size); + } + + // Constructing the transpose of A manually + bsrMat_t At_ref; + { + rowmap_t row_map("row map", numRows + 1); + entries_t entries("entries", nnz); + values_t values("values", nnz * block_size * block_size); + + const size_type row_mapPtr[] = {0, 1, 4, 5, 7}; + const lno_t entriesPtr[] = {2, 1, 2, 3, 0, 0, 3}; + const scalar_t valuesPtr[] = { + 3.0, 3.2, 3.1, 3.3, 2.0, 2.2, 2.1, 2.3, 4.0, 4.2, 4.1, 4.3, 5.0, 5.2, + 5.1, 5.3, 0.0, 0.2, 0.1, 0.3, 1.0, 1.2, 1.1, 1.3, 6.0, 6.2, 6.1, 6.3}; + + typename rowmap_t::HostMirror::const_type row_map_h(row_mapPtr, + numRows + 1); + typename entries_t::HostMirror::const_type entries_h(entriesPtr, nnz); + typename values_t::HostMirror::const_type values_h( + valuesPtr, nnz * block_size * block_size); + + Kokkos::deep_copy(row_map, row_map_h); + Kokkos::deep_copy(entries, entries_h); + Kokkos::deep_copy(values, values_h); + + At_ref = bsrMat_t("A", numRows, numRows, nnz, values, row_map, entries, + block_size); + } + + bsrMat_t At = KokkosSparse::Impl::transpose_bsr_matrix(A); + KokkosSparse::sort_bsr_matrix(At); + + CompareBsrMatrices(At, At_ref); +} + +template +void testTransposeBsr(int numRows, int numCols, int blockSize) { + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using mem_space = typename exec_space::memory_space; + using device_t = Kokkos::Device; + using bsrMat_t = + typename KokkosSparse::Experimental::BsrMatrix; + using c_rowmap_t = typename bsrMat_t::row_map_type; + using c_entries_t = typename bsrMat_t::index_type; + using c_values_t = typename bsrMat_t::values_type; + using rowmap_t = typename bsrMat_t::row_map_type::non_const_type; + using entries_t = typename bsrMat_t::index_type::non_const_type; + using values_t = typename bsrMat_t::values_type::non_const_type; + + // Generate a matrix that has 0 entries in some rows + size_type nnz = 10 * numRows; + bsrMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix( + blockSize, numRows, numCols, nnz, 3, numRows / 4); + + // compute the transpose while unsorted, then transpose again + rowmap_t t_rowmap("Rowmap^T", numCols + 1); // this view is initialized to 0 + entries_t t_entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T"), + A.graph.entries.extent(0)); + values_t t_values(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"), + A.values.extent(0)); + rowmap_t tt_rowmap("Rowmap^T^T", + numRows + 1); // this view is initialized to 0 + entries_t tt_entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T^T"), + A.graph.entries.extent(0)); + values_t tt_values( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"), + A.values.extent(0)); + + KokkosSparse::Impl::transpose_bsr_matrix( + numRows, numCols, blockSize, A.graph.row_map, A.graph.entries, A.values, + t_rowmap, t_entries, t_values); + + KokkosSparse::Impl::transpose_bsr_matrix< + rowmap_t, entries_t, values_t, rowmap_t, entries_t, values_t, exec_space>( + numCols, numRows, blockSize, t_rowmap, t_entries, t_values, tt_rowmap, + tt_entries, tt_values); + bsrMat_t Att("Att", numRows, numCols, nnz, tt_values, tt_rowmap, tt_entries, + blockSize); + + // Sort both the transpose-transpose, and the original matrix (to compare + // directly) + KokkosSparse::sort_bsr_matrix(A); + + KokkosSparse::sort_bsr_matrix(Att); + + CompareBsrMatrices(A, Att); +} + +TEST_F(TestCategory, sparse_transpose_matrix) { // Test both matrix and graph transpose with various sizes testTranspose(100, 100, true); testTranspose(500, 50, true); @@ -162,7 +334,7 @@ TEST_F(TestCategory, common_transpose_matrix) { testTranspose(2000, 2000, true); } -TEST_F(TestCategory, common_transpose_graph) { +TEST_F(TestCategory, sparse_transpose_graph) { testTranspose(100, 100, false); testTranspose(500, 50, false); testTranspose(50, 500, false); @@ -171,4 +343,15 @@ TEST_F(TestCategory, common_transpose_graph) { testTranspose(2000, 2000, false); } +TEST_F(TestCategory, sparse_transpose_bsr_matrix) { + testTransposeBsrRef(); + // Test bsrMatrix transpose with various sizes + testTransposeBsr(100, 100, 3); + testTransposeBsr(500, 50, 5); + testTransposeBsr(50, 500, 16); + testTransposeBsr(4000, 2000, 3); + testTransposeBsr(2000, 4000, 3); + testTransposeBsr(2000, 2000, 5); +} + #endif From 103f3a89a67b2aed12d0329cd135cb0cbbb878cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Mon, 25 Jul 2022 12:19:30 +0200 Subject: [PATCH 244/261] fix: connect MKL headers in CMake --- cmake/Modules/FindTPLMKL.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/Modules/FindTPLMKL.cmake b/cmake/Modules/FindTPLMKL.cmake index 5766e0f5b0..56f4f34c9e 100644 --- a/cmake/Modules/FindTPLMKL.cmake +++ b/cmake/Modules/FindTPLMKL.cmake @@ -41,6 +41,10 @@ ELSE() LIBRARY_PATHS ${MKL_ROOT}/lib/intel64 ${ENV_LIBDIRS} + HEADER + mkl.h + HEADER_PATHS + ${MKL_ROOT}/include ) ENDIF() ENDIF() From e4ae48bf1ec85af08bbc6f8f51a8af7bf1b93a98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Fri, 22 Jul 2022 12:32:50 +0200 Subject: [PATCH 245/261] fix MKL pointer casts --- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 24 +++++++++---------- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 8 +++---- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index d0ea5cdc26..93457f9837 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -112,9 +112,9 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); - MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); - MKL_Complex8& beta_mkl = reinterpret_cast(beta); - matrix_descr A_descr = getDescription(); + MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; + MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; + matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv( op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); @@ -133,9 +133,9 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); - matrix_descr A_descr = getDescription(); - MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); - MKL_Complex16& beta_mkl = reinterpret_cast(beta); + matrix_descr A_descr = getDescription(); + MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; + MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv( op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); @@ -189,9 +189,9 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); - MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); - MKL_Complex8& beta_mkl = reinterpret_cast(beta); - matrix_descr A_descr = getDescription(); + MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; + MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; + matrix_descr A_descr = getDescription(); KOKKOSKERNELS_MKL_SAFE_CALL( mkl_sparse_c_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, reinterpret_cast(x), colx, ldx, @@ -210,9 +210,9 @@ inline void spm_mv_block_impl_mkl( const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); - matrix_descr A_descr = getDescription(); - MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); - MKL_Complex16& beta_mkl = reinterpret_cast(beta); + matrix_descr A_descr = getDescription(); + MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; + MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; KOKKOSKERNELS_MKL_SAFE_CALL( mkl_sparse_z_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, reinterpret_cast(x), colx, ldx, diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 868d8ec047..b4c73a12ff 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -583,8 +583,8 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); - MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); - MKL_Complex8& beta_mkl = reinterpret_cast(beta); + MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; + MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv( op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); @@ -605,8 +605,8 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); - MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); - MKL_Complex16& beta_mkl = reinterpret_cast(beta); + MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; + MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv( op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); From f72b456bf836ca6bf72da25dabe421ba4c3e0e23 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 26 Jul 2022 08:53:23 -0700 Subject: [PATCH 246/261] Clean up --- src/sparse/KokkosSparse_spiluk_handle.hpp | 52 +--- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 283 +----------------- .../KokkosSparse_spiluk_symbolic_impl.hpp | 131 +------- 3 files changed, 6 insertions(+), 460 deletions(-) diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp index e449b97057..2b220b091b 100644 --- a/src/sparse/KokkosSparse_spiluk_handle.hpp +++ b/src/sparse/KokkosSparse_spiluk_handle.hpp @@ -60,7 +60,6 @@ namespace Experimental { enum class SPILUKAlgorithm { SEQLVLSCHD_RP, SEQLVLSCHD_TP1 /*, SEQLVLSCHED_TP2*/ - //SEQLVLSCHD_TP1HASHMAP }; template -//struct ILUKLvlSchedTP1HashMapNumericFunctor { -// using execution_space = typename ARowMapType::execution_space; -// using policy_type = Kokkos::TeamPolicy; -// using member_type = typename policy_type::member_type; -// using size_type = typename ARowMapType::non_const_value_type; -// using scalar_t = typename AValuesType::non_const_value_type; -// using hashmap_type = KokkosKernels::Experimental::HashmapAccumulator< -// nnz_lno_t, nnz_lno_t, nnz_lno_t, -// KokkosKernels::Experimental::HashOpType::bitwiseAnd>; -// -// ARowMapType A_row_map; -// AEntriesType A_entries; -// AValuesType A_values; -// LRowMapType L_row_map; -// LEntriesType L_entries; -// LValuesType L_values; -// URowMapType U_row_map; -// UEntriesType U_entries; -// UValuesType U_values; -// LevelViewType level_idx; -// nnz_lno_t lev_start; -// nnz_lno_t shmem_hash_size; -// nnz_lno_t shmem_key_size; -// nnz_lno_t shared_memory_hash_func; -// nnz_lno_t shmem_size; -// -// ILUKLvlSchedTP1HashMapNumericFunctor( -// const ARowMapType &A_row_map_, const AEntriesType &A_entries_, -// const AValuesType &A_values_, const LRowMapType &L_row_map_, -// const LEntriesType &L_entries_, LValuesType &L_values_, -// const URowMapType &U_row_map_, const UEntriesType &U_entries_, -// UValuesType &U_values_, const LevelViewType &level_idx_, -// const nnz_lno_t &lev_start_, const nnz_lno_t &shmem_hash_size_, -// const nnz_lno_t &shmem_key_size_, -// const nnz_lno_t &shared_memory_hash_func_, const nnz_lno_t &shmem_size_) -// : A_row_map(A_row_map_), -// A_entries(A_entries_), -// A_values(A_values_), -// L_row_map(L_row_map_), -// L_entries(L_entries_), -// L_values(L_values_), -// U_row_map(U_row_map_), -// U_entries(U_entries_), -// U_values(U_values_), -// level_idx(level_idx_), -// lev_start(lev_start_), -// shmem_hash_size(shmem_hash_size_), -// shmem_key_size(shmem_key_size_), -// shared_memory_hash_func(shared_memory_hash_func_), -// shmem_size(shmem_size_) {} -// -// KOKKOS_INLINE_FUNCTION -// void operator()(const member_type &team) const { -// auto my_league = team.league_rank(); // teamid -// auto rowid = level_idx(my_league + lev_start); // teamid-->rowid -// // auto my_team = team.team_rank(); -// -// // START shared hash map initialization -// char *all_shared_memory = (char *)(team.team_shmem().get_shmem(shmem_size)); -// -// // Threads in a team share 4 arrays: begin, next, keys, values -// // used_hash_sizes hold the size of 1st and 2nd level hashes (not using 2nd -// // level hash right now) -// volatile nnz_lno_t *used_hash_sizes = -// (volatile nnz_lno_t *)(all_shared_memory); -// all_shared_memory += sizeof(nnz_lno_t) * 2; -// -// // points to begin array -// nnz_lno_t *begins = (nnz_lno_t *)(all_shared_memory); -// all_shared_memory += sizeof(nnz_lno_t) * shmem_hash_size; -// -// // points to the next elements -// nnz_lno_t *nexts = (nnz_lno_t *)(all_shared_memory); -// all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size; -// -// // holds the keys and vals -// nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory); -// all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size; -// nnz_lno_t *vals = (nnz_lno_t *)(all_shared_memory); -// -// hashmap_type hm(shmem_key_size, shared_memory_hash_func, begins, nexts, -// keys, vals); -// -// // initialize begins -// Kokkos::parallel_for(Kokkos::TeamThreadRange(team, shmem_hash_size), -// [&](int i) { begins[i] = -1; }); -// -// // initialize hash usage sizes -// Kokkos::single(Kokkos::PerTeam(team), [&]() { -// used_hash_sizes[0] = 0; -// used_hash_sizes[1] = 0; -// }); -// -// team.team_barrier(); -// // Shared hash map initialization DONE -// -// auto k1 = L_row_map(rowid); -// auto k2 = L_row_map(rowid + 1); -//#ifdef KEEP_DIAG -// Kokkos::parallel_for( -// Kokkos::TeamThreadRange(team, k1, k2 - 1), [&](const nnz_lno_t k) { -// nnz_lno_t col = static_cast(L_entries(k)); -// L_values(k) = 0.0; -// int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr( -// col, k, used_hash_sizes); -// }); -//#else -// Kokkos::parallel_for( -// Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { -// nnz_lno_t col = static_cast(L_entries(k)); -// L_values(k) = 0.0; -// int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr( -// col, k, used_hash_sizes); -// }); -//#endif -// -//#ifdef KEEP_DIAG -// // if ( my_team == 0 ) L_values(k2-1) = scalar_t(1.0); -// Kokkos::single(Kokkos::PerTeam(team), -// [&]() { L_values(k2 - 1) = scalar_t(1.0); }); -//#endif -// -// team.team_barrier(); -// -// k1 = U_row_map(rowid); -// k2 = U_row_map(rowid + 1); -// Kokkos::parallel_for( -// Kokkos::TeamThreadRange(team, k1, k2), [&](const nnz_lno_t k) { -// nnz_lno_t col = static_cast(U_entries(k)); -// U_values(k) = 0.0; -// int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeOr( -// col, k, used_hash_sizes); -// }); -// -// // Kokkos::single(Kokkos::PerTeam(team),[&] () { -// // if (temp_nnz_cnt > shmem_key_size) -// // printf("VINHVINH teamid %d, rowid %d (at level %d), temp_nnz_cnt %d, -// // shmem_key_size %d\n", my_league, rowid, lvl+1, temp_nnz_cnt, -// // shmem_key_size); -// //}); -// -// team.team_barrier(); -// -// // Unpack the ith row of A -// k1 = A_row_map(rowid); -// k2 = A_row_map(rowid + 1); -// Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), -// [&](const nnz_lno_t k) { -// nnz_lno_t col = static_cast(A_entries(k)); -// nnz_lno_t hashmap_idx = hm.find(col); -// if (hashmap_idx != -1) { -// nnz_lno_t ipos = hm.values[hashmap_idx]; -// if (col < rowid) -// L_values(ipos) = A_values(k); -// else -// U_values(ipos) = A_values(k); -// } -// }); -// -// team.team_barrier(); -// -// // Eliminate prev rows -// k1 = L_row_map(rowid); -// k2 = L_row_map(rowid + 1); -//#ifdef KEEP_DIAG -// for (auto k = k1; k < k2 - 1; ++k) -//#else -// for (auto k = k1; k < k2; ++k) -//#endif -// { -// auto prev_row = L_entries(k); -//#ifdef KEEP_DIAG -// auto fact = L_values(k) / U_values(U_row_map(prev_row)); -//#else -// auto fact = L_values(k) * U_values(U_row_map(prev_row)); -//#endif -// // if ( my_team == 0 ) L_values(k) = fact; -// Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; }); -// -// team.team_barrier(); -// -// Kokkos::parallel_for( -// Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, -// U_row_map(prev_row + 1)), -// [&](const size_type kk) { -// nnz_lno_t col = static_cast(U_entries(kk)); -// nnz_lno_t hashmap_idx = hm.find(col); -// if (hashmap_idx != -1) { -// nnz_lno_t ipos = hm.values[hashmap_idx]; -// auto lxu = -U_values(kk) * fact; -// if (col < rowid) -// // L_values(ipos) += lxu; -// Kokkos::atomic_add(&L_values(ipos), lxu); -// else -// // U_values(ipos) += lxu; -// Kokkos::atomic_add(&U_values(ipos), lxu); -// } -// }); // end for kk -// -// team.team_barrier(); -// } // end for k -// -// // if ( my_team == 0 ) { -// Kokkos::single(Kokkos::PerTeam(team), [&]() { -// nnz_lno_t hashmap_idx = hm.find(rowid); -// if (hashmap_idx != -1) { -// nnz_lno_t ipos = hm.values[hashmap_idx]; -//#ifdef KEEP_DIAG -// if (U_values(ipos) == 0.0) { -// U_values(ipos) = 1e6; -// } -//#else -// if (U_values(ipos) == 0.0) { -// U_values(ipos) = 1e6; -// } -// else { -// U_values(ipos) = 1.0 / U_values(ipos); -// } -//#endif -// } -// }); -// //} -// } -// -// // nnz_lno_t team_shmem_size(int /* team_size */) const { -// // return shmem_size; -// //} -//}; - template ; - - nnz_lno_t shmem_hash_size = - static_cast(level_shmem_hash_size(lvl)); - nnz_lno_t shmem_key_size = - static_cast(level_shmem_key_size(lvl)); - - nnz_lno_t shared_memory_hash_func = - shmem_hash_size - 1; // for AND operation we use -1 - - // shmem needs the first 2 entries for sizes - nnz_lno_t shmem_size = - (2 + shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); - - int team_size = thandle.get_team_size(); - ILUKLvlSchedTP1HashMapNumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, - LValuesType, URowMapType, UEntriesType, UValuesType, - HandleDeviceEntriesType, nnz_lno_t> - tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, - U_row_map, U_entries, U_values, level_idx, lev_start, - shmem_hash_size, shmem_key_size, shared_memory_hash_func, - shmem_size); - if (team_size == -1) { - policy_type team_policy(lev_end - lev_start, Kokkos::AUTO); - team_policy.set_scratch_size(0, Kokkos::PerTeam(shmem_size)); - Kokkos::parallel_for("parfor_l_team", team_policy, tstf); - } else { - policy_type team_policy(lev_end - lev_start, team_size); - team_policy.set_scratch_size(0, Kokkos::PerTeam(shmem_size)); - Kokkos::parallel_for("parfor_l_team", team_policy, tstf); - } - } // end if - } // end for lvl - } // End SEQLVLSCHD_TP1HASHMAP - else*/ - { + //{ if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { level_nchunks_h = thandle.get_level_nchunks(); @@ -763,7 +484,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, } } // end if } // end for lvl - } + //} // Output check #ifdef NUMERIC_OUTPUT_INFO diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 3251ae93d1..2f7ce73e37 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -215,129 +215,6 @@ void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, thandle.set_level_maxrowsperchunk(maxrowsperchunk); } -/*template -void level_sched_hashmap(IlukHandle& thandle, const LRowMapType L_row_map, - const LEntriesType L_entries, - const URowMapType U_row_map, - const UEntriesType U_entries, LevelType1& level_list, - LevelType2& level_ptr, LevelType2& level_idx, - size_type& nlevels) { - // Scheduling currently compute on host - - using nnz_lno_t = typename IlukHandle::nnz_lno_t; - - size_type nrows = thandle.get_nrows(); - - nlevels = 0; - level_ptr(0) = 0; - - for (size_type i = 0; i < nrows; ++i) { - size_type l = 0; - size_type rowstart = L_row_map(i); - size_type rowend = L_row_map(i + 1); - for (size_type j = rowstart; j < rowend; ++j) { - nnz_lno_t col = L_entries(j); - l = std::max(l, level_list(col)); - } - level_list(i) = l + 1; - level_ptr(l + 1) += 1; - nlevels = std::max(nlevels, l + 1); - } - - for (size_type i = 1; i <= nlevels; ++i) { - level_ptr(i) += level_ptr(i - 1); - } - - for (size_type i = 0; i < nrows; i++) { - level_idx(level_ptr(level_list(i) - 1)) = i; - level_ptr(level_list(i) - 1) += 1; - } - - if (nlevels > 0) { // note: to avoid wrapping around to the max of size_t - // when nlevels = 0. - for (size_type i = nlevels - 1; i > 0; --i) { - level_ptr(i) = level_ptr(i - 1); - } - } - - level_ptr(0) = 0; - - // Find the maximum number of nnz per row per level - // Determine shmem hash size and key size - //(max. number of non-zeros in both L and U) - size_type maxrows = 0; - - thandle.alloc_level_maxnnzperrow(nlevels); - thandle.alloc_level_shmem_hash_size(nlevels); - thandle.alloc_level_shmem_key_size(nlevels); - - auto level_maxnnzperrow = thandle.get_level_maxnnzperrow(); - auto level_shmem_hash_size = thandle.get_level_shmem_hash_size(); - auto level_shmem_key_size = thandle.get_level_shmem_key_size(); - - for (size_type i = 0; i < nlevels; i++) { - size_type lnrows = level_ptr(i + 1) - level_ptr(i); - if (maxrows < lnrows) { - maxrows = lnrows; - } - // Determine the number of non-zeros in each level - size_type r_s = level_ptr(i); - size_type r_e = level_ptr(i + 1); - size_type lnnz = 0; - size_type lmaxnnz = 0; - for (size_type r = r_s; r < r_e; r++) { // Look at each row in a level - auto rid = level_idx(r); // get actual rowid in the level - size_type rnnz = (L_row_map(rid + 1) - L_row_map(rid)) + - (U_row_map(rid + 1) - - U_row_map(rid)); // count the number of non-zeros in - // the current row (both L and U) - lnnz += rnnz; // accumulate to count the nnz in the current level - if (lmaxnnz < rnnz) { - lmaxnnz = rnnz; - } - } - level_maxnnzperrow(i) = lmaxnnz; - - size_type shmem_key_size = - lmaxnnz; // the number of keys can a team (row) hold - - // put the hash size closest power of 2. - // we round down here, because we want to store more keys, - // conflicts are cheaper. - size_type shmem_hash_size = 1; - while (shmem_hash_size * 2 <= shmem_key_size) { - shmem_hash_size = shmem_hash_size * 2; - } - - // increase the key size with the left over from hash size. - shmem_key_size = - shmem_key_size + (shmem_key_size - shmem_hash_size) / - 3; // note: divided by 3 because nexts, keys, - // values have sizes of shmem_key_size - // round it down to 2 and multiply by 2, because of some alignment issues. - shmem_key_size = (shmem_key_size >> 1) << 1; - - level_shmem_hash_size(i) = shmem_hash_size; - level_shmem_key_size(i) = shmem_key_size; - - // if ((i < 20)|| (i >= (nlevels-20))) { - // std::cout << "Level " << i+1 << " has " << level_ptr(i+1) - level_ptr(i) - // << " rows"; std::cout << ", maxnnzperrow: " << level_maxnnzperrow(i); - // std::cout << ", shmem_hash_size: " << level_shmem_hash_size(i); - // std::cout << ", shmem_key_size: " << level_shmem_key_size(i); - // std::cout << ", shared_memory_hash_func: " << - // level_shmem_hash_size(i)-1; std::cout << ", shmem_size: " << (2 + - // shmem_hash_size + shmem_key_size * 3) * sizeof(nnz_lno_t); std::cout << - // std::endl; - //} - } - - thandle.set_num_levels(nlevels); - thandle.set_level_maxrows(maxrows); -}*/ - // Linear Search for the smallest row index template size_type search_col_index(nnz_lno_t j, size_type lenl, ViewType h_iL, @@ -382,8 +259,6 @@ void iluk_symbolic(IlukHandle& thandle, thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) /* || thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) - || thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHED_TP2 )*/ { // Scheduling and symbolic phase currently compute on host - need host copy @@ -590,11 +465,7 @@ void iluk_symbolic(IlukHandle& thandle, } // Level scheduling on L - /*if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1HASHMAP) { - level_sched_hashmap(thandle, L_row_map, L_entries, U_row_map, U_entries, - level_list, level_ptr, level_idx, nlev); - } else*/ if (thandle.get_algorithm() == + if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { level_sched_tp(thandle, L_row_map, L_entries, level_list, level_ptr, level_idx, nlev); From 8b6c7b8d960c1abf3f7605c858ccb0cdaff00396 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 26 Jul 2022 10:08:02 -0700 Subject: [PATCH 247/261] Apply clang format --- src/sparse/KokkosSparse_spiluk_handle.hpp | 13 +- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 117 +++++++++--------- .../KokkosSparse_spiluk_symbolic_impl.hpp | 2 +- 3 files changed, 64 insertions(+), 68 deletions(-) diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp index 2b220b091b..54cc124474 100644 --- a/src/sparse/KokkosSparse_spiluk_handle.hpp +++ b/src/sparse/KokkosSparse_spiluk_handle.hpp @@ -160,13 +160,12 @@ class SPILUKHandle { set_nnzU(nnzU_); set_level_maxrows(0); set_level_maxrowsperchunk(0); - level_list = nnz_row_view_t("level_list", nrows_), - level_idx = nnz_lno_view_t("level_idx", nrows_), - level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), - level_nchunks = nnz_lno_view_host_t(), - level_nrowsperchunk = nnz_lno_view_host_t(), - reset_symbolic_complete(), - iw = work_view_t(); + level_list = nnz_row_view_t("level_list", nrows_), + level_idx = nnz_lno_view_t("level_idx", nrows_), + level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), + level_nchunks = nnz_lno_view_host_t(), + level_nrowsperchunk = nnz_lno_view_host_t(), reset_symbolic_complete(), + iw = work_view_t(); } virtual ~SPILUKHandle(){}; diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 6ec5283023..b7dffbe6ae 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -421,69 +421,66 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, Kokkos::deep_copy(level_ptr_h, level_ptr); //{ - if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - level_nchunks_h = thandle.get_level_nchunks(); - level_nrowsperchunk_h = thandle.get_level_nrowsperchunk(); - } - iw = thandle.get_iw(); - - // Main loop must be performed sequential. Question: Try out Cuda's graph - // stuff to reduce kernel launch overhead - for (size_type lvl = 0; lvl < nlevels; ++lvl) { - nnz_lno_t lev_start = level_ptr_h(lvl); - nnz_lno_t lev_end = level_ptr_h(lvl + 1); - - if ((lev_end - lev_start) != 0) { - if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { - Kokkos::parallel_for( - "parfor_fixed_lvl", - Kokkos::RangePolicy(lev_start, lev_end), - ILUKLvlSchedRPNumericFunctor< - ARowMapType, AEntriesType, AValuesType, LRowMapType, - LEntriesType, LValuesType, URowMapType, UEntriesType, - UValuesType, HandleDeviceEntriesType, WorkViewType, - nnz_lno_t>(A_row_map, A_entries, A_values, L_row_map, - L_entries, L_values, U_row_map, U_entries, - U_values, level_idx, iw, lev_start)); - } else if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm:: - SEQLVLSCHD_TP1) { - using policy_type = Kokkos::TeamPolicy; - int team_size = thandle.get_team_size(); - - nnz_lno_t lvl_rowid_start = 0; - nnz_lno_t lvl_nrows_chunk; - for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) { - if ((lvl_rowid_start + level_nrowsperchunk_h(lvl)) > - (lev_end - lev_start)) - lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start; - else - lvl_nrows_chunk = level_nrowsperchunk_h(lvl); - - ILUKLvlSchedTP1NumericFunctor< + if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + level_nchunks_h = thandle.get_level_nchunks(); + level_nrowsperchunk_h = thandle.get_level_nrowsperchunk(); + } + iw = thandle.get_iw(); + + // Main loop must be performed sequential. Question: Try out Cuda's graph + // stuff to reduce kernel launch overhead + for (size_type lvl = 0; lvl < nlevels; ++lvl) { + nnz_lno_t lev_start = level_ptr_h(lvl); + nnz_lno_t lev_end = level_ptr_h(lvl + 1); + + if ((lev_end - lev_start) != 0) { + if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_RP) { + Kokkos::parallel_for( + "parfor_fixed_lvl", + Kokkos::RangePolicy(lev_start, lev_end), + ILUKLvlSchedRPNumericFunctor< ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, LValuesType, URowMapType, UEntriesType, - UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t> - tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, - L_values, U_row_map, U_entries, U_values, level_idx, iw, - lev_start + lvl_rowid_start); - - if (team_size == -1) - Kokkos::parallel_for("parfor_l_team", - policy_type(lvl_nrows_chunk, Kokkos::AUTO), - tstf); - else - Kokkos::parallel_for("parfor_l_team", - policy_type(lvl_nrows_chunk, team_size), - tstf); - Kokkos::fence(); - lvl_rowid_start += lvl_nrows_chunk; - } + UValuesType, HandleDeviceEntriesType, WorkViewType, nnz_lno_t>( + A_row_map, A_entries, A_values, L_row_map, L_entries, L_values, + U_row_map, U_entries, U_values, level_idx, iw, lev_start)); + } else if (thandle.get_algorithm() == + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + using policy_type = Kokkos::TeamPolicy; + int team_size = thandle.get_team_size(); + + nnz_lno_t lvl_rowid_start = 0; + nnz_lno_t lvl_nrows_chunk; + for (int chunkid = 0; chunkid < level_nchunks_h(lvl); chunkid++) { + if ((lvl_rowid_start + level_nrowsperchunk_h(lvl)) > + (lev_end - lev_start)) + lvl_nrows_chunk = (lev_end - lev_start) - lvl_rowid_start; + else + lvl_nrows_chunk = level_nrowsperchunk_h(lvl); + + ILUKLvlSchedTP1NumericFunctor< + ARowMapType, AEntriesType, AValuesType, LRowMapType, LEntriesType, + LValuesType, URowMapType, UEntriesType, UValuesType, + HandleDeviceEntriesType, WorkViewType, nnz_lno_t> + tstf(A_row_map, A_entries, A_values, L_row_map, L_entries, + L_values, U_row_map, U_entries, U_values, level_idx, iw, + lev_start + lvl_rowid_start); + + if (team_size == -1) + Kokkos::parallel_for("parfor_l_team", + policy_type(lvl_nrows_chunk, Kokkos::AUTO), + tstf); + else + Kokkos::parallel_for("parfor_l_team", + policy_type(lvl_nrows_chunk, team_size), tstf); + Kokkos::fence(); + lvl_rowid_start += lvl_nrows_chunk; } - } // end if - } // end for lvl + } + } // end if + } // end for lvl //} // Output check diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 2f7ce73e37..691d624963 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -466,7 +466,7 @@ void iluk_symbolic(IlukHandle& thandle, // Level scheduling on L if (thandle.get_algorithm() == - KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { + KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { level_sched_tp(thandle, L_row_map, L_entries, level_list, level_ptr, level_idx, nlev); thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows); From c150c0d838bf826d95894e4671ac964c870c8390 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 26 Jul 2022 12:53:56 -0700 Subject: [PATCH 248/261] Remove unused variables --- src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index b7dffbe6ae..4ef59db950 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -245,8 +245,8 @@ struct ILUKLvlSchedTP1NumericFunctor { nnz_lno_t my_team = static_cast(team.league_rank()); nnz_lno_t rowid = static_cast(level_idx(my_team + lev_start)); // map to rowid - nnz_lno_t my_thread = static_cast(team.team_rank()); - nnz_lno_t ts = static_cast(team.team_size()); + //nnz_lno_t my_thread = static_cast(team.team_rank()); + //nnz_lno_t ts = static_cast(team.team_size()); nnz_lno_t k1 = static_cast(L_row_map(rowid)); nnz_lno_t k2 = static_cast(L_row_map(rowid + 1)); @@ -402,7 +402,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; size_type nlevels = thandle.get_num_levels(); - size_type nrows = thandle.get_nrows(); + // Keep these as host View, create device version and copy back to host HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); @@ -489,7 +489,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, std::cout << " nnzL: " << thandle.get_nnzL() << std::endl; std::cout << " L_row_map = "; - for (size_type i = 0; i < nrows + 1; ++i) { + for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { std::cout << L_row_map(i) << " "; } std::cout << std::endl; @@ -508,7 +508,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, std::cout << " nnzU: " << thandle.get_nnzU() << std::endl; std::cout << " U_row_map = "; - for (size_type i = 0; i < nrows + 1; ++i) { + for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { std::cout << U_row_map(i) << " "; } std::cout << std::endl; From 0eb52ac4107ae80d85f78711b4bff28f3fcdb7b1 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 26 Jul 2022 13:11:28 -0700 Subject: [PATCH 249/261] Apply clang format --- src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index 4ef59db950..efc60e5ff0 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -245,8 +245,8 @@ struct ILUKLvlSchedTP1NumericFunctor { nnz_lno_t my_team = static_cast(team.league_rank()); nnz_lno_t rowid = static_cast(level_idx(my_team + lev_start)); // map to rowid - //nnz_lno_t my_thread = static_cast(team.team_rank()); - //nnz_lno_t ts = static_cast(team.team_size()); + // nnz_lno_t my_thread = static_cast(team.team_rank()); + // nnz_lno_t ts = static_cast(team.team_size()); nnz_lno_t k1 = static_cast(L_row_map(rowid)); nnz_lno_t k2 = static_cast(L_row_map(rowid + 1)); @@ -403,7 +403,6 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, size_type nlevels = thandle.get_num_levels(); - // Keep these as host View, create device version and copy back to host HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); HandleDeviceEntriesType level_idx = thandle.get_level_idx(); From d615dd1f591832168aeaf13c685587ef8535fb8b Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 26 Jul 2022 14:47:55 -0700 Subject: [PATCH 250/261] Remove unused typedef --- src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index efc60e5ff0..a0cfd1e3cc 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -394,7 +394,6 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, LValuesType &L_values, const URowMapType &U_row_map, const UEntriesType &U_entries, UValuesType &U_values) { using execution_space = typename IlukHandle::execution_space; - using memory_space = typename IlukHandle::memory_space; using size_type = typename IlukHandle::size_type; using nnz_lno_t = typename IlukHandle::nnz_lno_t; using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; From 4f173b98de5593a4de6736e9e751b4a80cb1ef3e Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 25 Jul 2022 17:43:33 -0600 Subject: [PATCH 251/261] Newton sover: serial on device implementation of Newton's method This is currently experimental and does not have a polished public interface so it is kept in experimental. Will also add a test that solves a linear system which should converge in a single iteration. The residual norm and the alpha line search parameters do not need to be stored as views. Simple stack variables are appropriate and if needed these local values can be stored in the handle for later retrival as is the case of the "lastResidual". Signed-off-by: Luc Berger-Vergiat --- src/blas/impl/KokkosBlas_Newton_impl.hpp | 240 +++++++++++++++++++++++ unit_test/blas/Test_Blas.hpp | 3 + unit_test/blas/Test_Blas_Newton.hpp | 187 ++++++++++++++++++ 3 files changed, 430 insertions(+) create mode 100644 src/blas/impl/KokkosBlas_Newton_impl.hpp create mode 100644 unit_test/blas/Test_Blas_Newton.hpp diff --git a/src/blas/impl/KokkosBlas_Newton_impl.hpp b/src/blas/impl/KokkosBlas_Newton_impl.hpp new file mode 100644 index 0000000000..02618c3141 --- /dev/null +++ b/src/blas/impl/KokkosBlas_Newton_impl.hpp @@ -0,0 +1,240 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Luc Berger-Vergiat (lberge@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef __KOKKOSBATCHED_ODE_NEWTON_HPP__ +#define __KOKKOSBATCHED_ODE_NEWTON_HPP__ + +#include "Kokkos_Core.hpp" +#include "KokkosBatched_LU_Decl.hpp" +#include "KokkosBatched_LU_Serial_Impl.hpp" +#include "KokkosBatched_Gesv.hpp" +#include "KokkosBlas1_nrm2.hpp" +#include "KokkosBlas1_scal.hpp" +#include "KokkosBlas1_axpby.hpp" + +namespace KokkosBlas { +namespace Impl { + +enum class NewtonSolverStatus { Converged = 0, LinearSolveFailure, MaxIters }; + +std::ostream& operator<<(std::ostream& os, NewtonSolverStatus& status) { + switch (status) { + case NewtonSolverStatus::Converged: os << "Newton Solver Converged!"; break; + case NewtonSolverStatus::LinearSolveFailure: + os << "Newton: Linear Solver Failure"; + break; + case NewtonSolverStatus::MaxIters: + os << "Newton reached maximum iterations without convergence."; + break; + } + return os; +} + +/// \brief NewtonHandle +/// +/// This handle is used to pass information between the Newton Solver and +/// the calling code. +/// +/// \tparam: NormViewType: Type of view used to store the residual convergence +/// history + +template +struct NewtonHandle { + using norm_type = typename NormViewType::non_const_value_type; + + NormViewType lastResidual; // Residual of last successful iteration + typename NormViewType::HostMirror lastResidualHost; + + // NormViewType residual_norms; + // TODO: Making these public for now. Should make private and access + // via setters and getters? + int maxIters; // Maximum number of Newton steps + norm_type relativeTol; // Relative convergence tolerance + bool debug_mode; // Returns extra verbose output if true. + + NewtonHandle(int _maxIters = 25, double _relativeTol = 1.0e-6, + bool _debug = false) + : lastResidual("ending Residual norm", 1), + lastResidualHost("end res norm host", 1), + maxIters(_maxIters), + relativeTol(_relativeTol), + debug_mode(_debug) {} + + KOKKOS_FUNCTION + void set_residual(const norm_type val) const { lastResidual(0) = val; } + + KOKKOS_FUNCTION + norm_type get_residual() const { return lastResidual(0); } + + norm_type get_residual_host() const { + Kokkos::deep_copy(lastResidualHost, lastResidual); + return lastResidualHost(0); + } + +}; // NewtonHandle + +/// \brief Newton Functor: +/// Solves the nonlinear system F(x) = 0 +/// where F is a map from R^n to R^n. +/// \tparam System: Struct that allows the evaluation +/// of the residual and jacobian using the +/// residual() and jacobian() methods. +/// \tparam Matrix: rank-2 view-type +/// \tparam XVector: rank-1 view-type +/// \tparam YVector: rank-1 view-type +/// \param +/// \param X [in]: Input vector X, a rank 1 view +/// \param Y [in/out]: Output vector Y, a rank 1 view +/// +/// No nested parallel_for is used inside of the function. +/// +template +struct NewtonFunctor { + using execution_space = typename YVector::execution_space; + using yvalue_type = typename YVector::non_const_value_type; + using norm_type = typename NewtonHandleType::norm_type; + + System sys; + XVector x; + YVector rhs; + NewtonHandleType handle; + + Matrix J, tmp; + XVector update; + + NewtonFunctor(System _sys, XVector _x, YVector _rhs, + NewtonHandleType& _handle) + : sys(_sys), x(_x), rhs(_rhs), handle(_handle) { + J = Matrix("Jacobian", x.extent(0), x.extent(0)); + tmp = Matrix("Jacobian", x.extent(0), x.extent(0) + 4); + update = XVector("update", x.extent(0)); + } + + KOKKOS_INLINE_FUNCTION + NewtonSolverStatus solve() const { + norm_type norm = Kokkos::ArithTraits::zero(); + yvalue_type alpha = Kokkos::ArithTraits::one(); + handle.set_residual(-1); // init to dummy value + + // Iterate until maxIts or the tolerance is reached + for (int it = 0; it < handle.maxIters; ++it) { + // compute initial rhs + sys.residual(x, rhs); + if (handle.debug_mode) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("NewtonFunctor: r="); + for (int k = 0; k < rhs.extent_int(0); k++) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", rhs(k)); + } + } + + // Solve the following linearized + // problem at each step: J*update=-rhs + // with J=du/dx, rhs=f(u_n+update)-f(u_n) + norm = KokkosBlas::serial_nrm2(rhs); + handle.set_residual(norm); + + if (handle.debug_mode) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "NewtonFunctor: Iteration: %d Current res norm is: %e \n Current " + "soln is:\n", + it, (double)handle.get_residual()); + for (int k = 0; k < x.extent_int(0); k++) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k)); + } + } + + if (norm < handle.relativeTol) { + // Problem solved, exit the functor + if (handle.debug_mode) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "NewtonFunctor: Newton solver converged! Ending norm is: %e \n " + "Solution x is: " + "\n", + norm); + for (int k = 0; k < x.extent_int(0); k++) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k)); + } + } + return NewtonSolverStatus::Converged; + } + + // compute LHS + sys.jacobian(x, J); + + // solve linear problem + int linSolverStat = KokkosBatched::SerialGesv< + KokkosBatched::Gesv::StaticPivoting>::invoke(J, update, rhs, tmp); + KokkosBlas::SerialScale::invoke(-1, update); + + if (handle.debug_mode) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "NewtonFunctor: Print linear solve solution: \n"); + for (int k = 0; k < update.extent_int(0); k++) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", update(k)); + } + } + if (linSolverStat == 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "NewtonFunctor: Linear solve gesv returned failure! \n"); + return NewtonSolverStatus::LinearSolveFailure; + } + + // update solution // x = x + alpha*update + KokkosBlas::serial_axpy(alpha, update, x); + if (handle.debug_mode) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "NewtonFunctor: Print updated solution: \n"); + for (int k = 0; k < x.extent_int(0); k++) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k)); + } + } + } + return NewtonSolverStatus::MaxIters; + } // End solve functor. +}; + +} // namespace Impl +} // namespace KokkosBlas +#endif // __KOKKOSBATCHED_ODE_NEWTON_HPP__ diff --git a/unit_test/blas/Test_Blas.hpp b/unit_test/blas/Test_Blas.hpp index 77b5d14bc4..c607e74ca8 100644 --- a/unit_test/blas/Test_Blas.hpp +++ b/unit_test/blas/Test_Blas.hpp @@ -50,6 +50,9 @@ #include "Test_Blas3_trmm.hpp" #include "Test_Blas3_trsm.hpp" +// Stuff that should move later on +#include "Test_Blas_Newton.hpp" + // TPLs #include "Test_Blas_rocblas.hpp" diff --git a/unit_test/blas/Test_Blas_Newton.hpp b/unit_test/blas/Test_Blas_Newton.hpp new file mode 100644 index 0000000000..600ba3e0b6 --- /dev/null +++ b/unit_test/blas/Test_Blas_Newton.hpp @@ -0,0 +1,187 @@ +#include + +#include +#include + +namespace Test { + +// Logistic equation +// dy/dt=y(1-y) +// +// solution y = 1/(1+exp(-t)) +// y(0)=0.5 +// +// Using BDF1 to integrate: +// y-y_n=dt*y*(1-y) +// +// Residual: r = y - y_n - dt*y*(1-y) +// Jacobian: J = 1 - dt + 2*dt*y +template +struct LogisticEquation { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + const int neqs = 1; + scalar_type dt; + vec_type state; + + LogisticEquation(const scalar_type dt_, vec_type initial_state) + : dt(dt_), state(initial_state) {} + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& dydt) const { + dydt(0) = y(0) - state(0) - dt * y(0) * (1 - y(0)); + } + + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + jac(0, 0) = 1 - dt + 2 * dt * y(0); + } + + KOKKOS_FUNCTION scalar_type expected_val(const scalar_type t) const { + using Kokkos::exp; + + return static_cast(1 / (1 + exp(-t))); + } + + KOKKOS_FUNCTION int num_equations() const { return neqs; } +}; + +// Intersection of square and hyperbola +// x^2 + y^2 = 20 +// x^2 - y^2 = -2 +// +// solution: x = +/- 3 +// y = +/- sqrt(11) +// +// Residual: r = [x^2 + y^2 - 20] +// [x^2 - y^2 + 2] +// Jacobian: J = [2*x, 2*y] +// [2*x, -2*y] +template +struct Intersection { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + const int neqs = 2; + + Intersection() = default; + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& dydt) const { + dydt(0) = y(0) * y(0) + y(1) * y(1) - 20; + dydt(1) = y(0) * y(0) - y(1) * y(1) + 2; + } + + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + jac(0, 0) = 2 * y(0); + jac(0, 1) = 2 * y(1); + jac(1, 0) = 2 * y(0); + jac(1, 1) = -2 * y(1); + } + + KOKKOS_FUNCTION int num_equations() const { return neqs; } +}; + +template +struct NewtonWrapper { + solver newton_solver; + + NewtonWrapper(solver newton_solver_) : newton_solver(newton_solver_){}; + + KOKKOS_INLINE_FUNCTION + void operator()(const int /* system_index */) const { newton_solver.solve(); } +}; + +template +int test_logistic() { + using vec_type = typename Kokkos::View; + using mat_type = typename Kokkos::View; + using norm_type = typename Kokkos::View; + using handle_type = KokkosBlas::Impl::NewtonHandle; + using system_type = LogisticEquation; + using newton_type = + KokkosBlas::Impl::NewtonFunctor; + + // Create the non-linear system and initialize data + vec_type state("state", 1); + Kokkos::deep_copy(state, 0.5); + system_type ode(0.1, state); + + vec_type x("solution vector", 1), rhs("right hand side vector", 1); + Kokkos::deep_copy(x, 0.5); + + // Create the solver and wrapper + handle_type handle; + handle.debug_mode = false; + newton_type newton_solver(ode, x, rhs, handle); + NewtonWrapper wrapper(newton_solver); + + // Launch the problem in a parallel_for + Kokkos::RangePolicy my_policy(0, 1); + Kokkos::parallel_for(my_policy, wrapper); + + // Get the solution back and test it + auto x_h = Kokkos::create_mirror_view(x); + Kokkos::deep_copy(x_h, x); + printf("Non-linear problem solution:\n"); + printf(" [%f]\n", x_h(0)); + + return 0; +} + +template +int test_intersection() { + using vec_type = typename Kokkos::View; + using mat_type = typename Kokkos::View; + using norm_type = typename Kokkos::View; + using handle_type = KokkosBlas::Impl::NewtonHandle; + using system_type = Intersection; + using newton_type = + KokkosBlas::Impl::NewtonFunctor; + + // Create the non-linear system and initialize data + system_type intersection; + vec_type x("solution vector", 2), rhs("right hand side vector", 2); + { + typename vec_type::HostMirror x_h = Kokkos::create_mirror_view(x); + x_h(0) = 2.5; + x_h(1) = 3.0; + Kokkos::deep_copy(x, x_h); + } + + // Create the solver and wrapper + handle_type handle; + handle.debug_mode = false; + newton_type newton_solver(intersection, x, rhs, handle); + NewtonWrapper wrapper(newton_solver); + + // Launch the problem in a parallel_for + Kokkos::RangePolicy my_policy(0, 1); + Kokkos::parallel_for(my_policy, wrapper); + + // Get the solution back and test it + auto x_h = Kokkos::create_mirror_view(x); + Kokkos::deep_copy(x_h, x); + printf("Non-linear problem solution:\n"); + for (int idx = 0; idx < x_h.extent_int(0); ++idx) { + printf(" [%f]\n", x_h(idx)); + } + EXPECT_NEAR_KK(x_h(0), 3.0, 3.0e-4); + EXPECT_NEAR_KK(x_h(1), 3.3166247903553998, 3.3166247903553998 * 1.0e-4); + + return 0; +} + +} // namespace Test + +template +int test_newton() { + Test::test_logistic(); + Test::test_intersection(); + + return 1; +} + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, newton_serial) { test_newton(); } +#endif From 4288d2c088a7ecf1f44f25fa6ae3a3661177e861 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 27 Jul 2022 09:50:22 -0600 Subject: [PATCH 252/261] Newton solver: applying clang-format --- src/blas/impl/KokkosBlas_Newton_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blas/impl/KokkosBlas_Newton_impl.hpp b/src/blas/impl/KokkosBlas_Newton_impl.hpp index 02618c3141..a8a8973d41 100644 --- a/src/blas/impl/KokkosBlas_Newton_impl.hpp +++ b/src/blas/impl/KokkosBlas_Newton_impl.hpp @@ -154,7 +154,7 @@ struct NewtonFunctor { KOKKOS_INLINE_FUNCTION NewtonSolverStatus solve() const { - norm_type norm = Kokkos::ArithTraits::zero(); + norm_type norm = Kokkos::ArithTraits::zero(); yvalue_type alpha = Kokkos::ArithTraits::one(); handle.set_residual(-1); // init to dummy value From e40909a8728c860d6a669afdad10b200c0ea3dc6 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Thu, 28 Jul 2022 15:02:05 -0700 Subject: [PATCH 253/261] Fix type for k1 and k2 --- .../impl/KokkosSparse_spiluk_numeric_impl.hpp | 50 +++++++++---------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index a0cfd1e3cc..4af8606dfb 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -245,24 +245,22 @@ struct ILUKLvlSchedTP1NumericFunctor { nnz_lno_t my_team = static_cast(team.league_rank()); nnz_lno_t rowid = static_cast(level_idx(my_team + lev_start)); // map to rowid - // nnz_lno_t my_thread = static_cast(team.team_rank()); - // nnz_lno_t ts = static_cast(team.team_size()); - nnz_lno_t k1 = static_cast(L_row_map(rowid)); - nnz_lno_t k2 = static_cast(L_row_map(rowid + 1)); + size_type k1 = static_cast(L_row_map(rowid)); + size_type k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const nnz_lno_t k) { + [&](const size_type k) { nnz_lno_t col = static_cast(L_entries(k)); L_values(k) = 0.0; - iw(my_team, col) = k; + iw(my_team, col) = static_cast(k); }); #else Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const nnz_lno_t k) { + [&](const size_type k) { nnz_lno_t col = static_cast(L_entries(k)); L_values(k) = 0.0; - iw(my_team, col) = k; + iw(my_team, col) = static_cast(k); }); #endif @@ -274,22 +272,22 @@ struct ILUKLvlSchedTP1NumericFunctor { team.team_barrier(); - k1 = static_cast(U_row_map(rowid)); - k2 = static_cast(U_row_map(rowid + 1)); + k1 = static_cast(U_row_map(rowid)); + k2 = static_cast(U_row_map(rowid + 1)); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const nnz_lno_t k) { + [&](const size_type k) { nnz_lno_t col = static_cast(U_entries(k)); U_values(k) = 0.0; - iw(my_team, col) = k; + iw(my_team, col) = static_cast(k); }); team.team_barrier(); // Unpack the ith row of A - k1 = static_cast(A_row_map(rowid)); - k2 = static_cast(A_row_map(rowid + 1)); + k1 = static_cast(A_row_map(rowid)); + k2 = static_cast(A_row_map(rowid + 1)); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const nnz_lno_t k) { + [&](const size_type k) { nnz_lno_t col = static_cast(A_entries(k)); nnz_lno_t ipos = iw(my_team, col); if (col < rowid) @@ -301,12 +299,12 @@ struct ILUKLvlSchedTP1NumericFunctor { team.team_barrier(); // Eliminate prev rows - k1 = static_cast(L_row_map(rowid)); - k2 = static_cast(L_row_map(rowid + 1)); + k1 = static_cast(L_row_map(rowid)); + k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG - for (nnz_lno_t k = k1; k < k2 - 1; k++) + for (size_type k = k1; k < k2 - 1; k++) #else - for (nnz_lno_t k = k1; k < k2; k++) + for (size_type k = k1; k < k2; k++) #endif { nnz_lno_t prev_row = L_entries(k); @@ -358,26 +356,26 @@ struct ILUKLvlSchedTP1NumericFunctor { team.team_barrier(); // Reset - k1 = static_cast(L_row_map(rowid)); - k2 = static_cast(L_row_map(rowid + 1)); + k1 = static_cast(L_row_map(rowid)); + k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const nnz_lno_t k) { + [&](const size_type k) { nnz_lno_t col = static_cast(L_entries(k)); iw(my_team, col) = -1; }); #else Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const nnz_lno_t k) { + [&](const size_type k) { nnz_lno_t col = static_cast(L_entries(k)); iw(my_team, col) = -1; }); #endif - k1 = static_cast(U_row_map(rowid)); - k2 = static_cast(U_row_map(rowid + 1)); + k1 = static_cast(U_row_map(rowid)); + k2 = static_cast(U_row_map(rowid + 1)); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), - [&](const nnz_lno_t k) { + [&](const size_type k) { nnz_lno_t col = static_cast(U_entries(k)); iw(my_team, col) = -1; }); From f3f1059244e604d3bb15f16e64cb48dfb7215cbe Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Mon, 1 Aug 2022 16:08:54 -0600 Subject: [PATCH 254/261] TPLs: adding CUBLAS in the list of dependencies This was a strang oversight that creates issues with the Trilinos build of Kokkos Kernels. --- cmake/Dependencies.cmake | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 2dcedcc1c9..e8b1c6a5e2 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,7 +1,12 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL ROCBLAS ROCSPARSE + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL ROCBLAS ROCSPARSE CUBLAS TEST_OPTIONAL_TPLS yaml-cpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in -# the macro 'KOKKOSKERNELS_ADD_TPL_OPTION' that resides in cmake/kokkoskernels_tpls.cmake. \ No newline at end of file +# the macro 'KOKKOSKERNELS_ADD_TPL_OPTION' that resides in cmake/kokkoskernels_tpls.cmake. + +if (TPL_ENABLE_CUDA) + tribits_tpl_tentatively_enable(CUBLAS) +endif() + From b87e17e8279acf0f86476d0332bd8d33fee90789 Mon Sep 17 00:00:00 2001 From: "Roscoe A. Bartlett" Date: Fri, 29 Jul 2022 10:24:50 -0600 Subject: [PATCH 255/261] KokkosKernels: Fix install( ... DESTINATION ... ) dir (#10810) This correctly uses just ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR} which if relative, CMake assumes is relative to ${CMAKE_INSTALL_PREFIX}. Fixing this means that: cmake --install . --prefix works correctly. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 836b4963c1..c5261c326a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,7 +35,7 @@ CMAKE_POLICY(SET CMP0074 NEW) INCLUDE(GNUInstallDirs) IF (KOKKOSKERNELS_HAS_TRILINOS) - SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) + SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) SET(KOKKOSKERNELS_HEADER_INSTALL_DIR ${TRILINOS_INCDIR}) SET(KOKKOS_ENABLE_CUDA_UVM ${Kokkos_ENABLE_CUDA_UVM}) ELSEIF(KOKKOSKERNELS_HAS_PARENT) From f2ea13368d13741765df4800a55adf580f0bfeec Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 10 Aug 2022 09:56:58 -0600 Subject: [PATCH 256/261] Merge pull request #1488 from ndellingwood/issue-1487 Add gcc/7.3.0 to macro guard define for KOKKOSKERNELS_ENABLE_OMP_SIMD (cherry picked from commit 2f4d73b56dea60ff4383f204a3103478ae951127) --- src/KokkosKernels_Macros.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/KokkosKernels_Macros.hpp b/src/KokkosKernels_Macros.hpp index 1630028c54..67d86b6e0e 100644 --- a/src/KokkosKernels_Macros.hpp +++ b/src/KokkosKernels_Macros.hpp @@ -66,9 +66,10 @@ // https://clang.llvm.org/docs/OpenMPSupport.html#id1 #if defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) // GCC 4.8.5 and older do not support #pragma omp simd -// Do not enable when using GCC 7.2.0 + C++17 due to a bug in gcc -#if (KOKKOS_COMPILER_GNU > 485) && \ - !(KOKKOS_COMPILER_GNU == 720 && defined(KOKKOS_ENABLE_CXX17)) +// Do not enable when using GCC 7.2.0 or 7.3.0 + C++17 due to a bug in gcc +#if (KOKKOS_COMPILER_GNU > 485) && \ + !(KOKKOS_COMPILER_GNU == 720 && defined(KOKKOS_ENABLE_CXX17)) && \ + !(KOKKOS_COMPILER_GNU == 730 && defined(KOKKOS_ENABLE_CXX17)) #define KOKKOSKERNELS_ENABLE_OMP_SIMD #endif // TODO: Check for a clang version that supports #pragma omp simd From fb5918ed1e46f2640c460be9e825fe7523b5f0eb Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 18 Aug 2022 13:18:57 -0600 Subject: [PATCH 257/261] Update to version 3.7.00 --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5261c326a..40d6dd407b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,8 +24,8 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) PROJECT(KokkosKernels CXX) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) - SET(KokkosKernels_VERSION_MINOR 6) - SET(KokkosKernels_VERSION_PATCH 99) + SET(KokkosKernels_VERSION_MINOR 7) + SET(KokkosKernels_VERSION_PATCH 00) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}") ENDIF() From 5b443118fbcacbf289a4ab5afcd3b2c94da978c4 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 18 Aug 2022 13:25:16 -0600 Subject: [PATCH 258/261] Adding Changelog for Release 3.7.00 Part of Kokkos C++ Performance Portability Programming EcoSystem 3.7 --- CHANGELOG.md | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b0ea4553b4..4eb3d438e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,101 @@ # Change Log +## [3.7.00](https://github.com/kokkos/kokkos-kernels/tree/3.7.00) (2022-08-18) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.6.01...3.7.00) + +### Features: +- Sparse: bsr transpose algorithm [\#1477](https://github.com/kokkos/kokkos-kernels/pull/1477) +- Newton solver: serial on device implementation of Newton's method [\#1479](https://github.com/kokkos/kokkos-kernels/pull/1479) +- Added https://kokkos-kernels.readthedocs.io [\#1451](https://github.com/kokkos/kokkos-kernels/pull/1451) +- Add cuSparse TPL files for CrsMatrix-multivector product [\#1427](https://github.com/kokkos/kokkos-kernels/pull/1427) +- Add batched GESV [\#1384](https://github.com/kokkos/kokkos-kernels/pull/1384) +- Add csc2csr [\#1342](https://github.com/kokkos/kokkos-kernels/pull/1342) +- BSR block SpGEMM implementation [\#1099](https://github.com/kokkos/kokkos-kernels/pull/1099) + +### Deprecations: +- Add template params to forwarding calls in deprecated KokkosKernels::… [\#1441](https://github.com/kokkos/kokkos-kernels/pull/1441) + +### Implemented enhancements: +- SPILUK: Move host allocations to symbolic [\#1480](https://github.com/kokkos/kokkos-kernels/pull/1480) +- Reformat example/fenl files changed in 1382 [\#1464](https://github.com/kokkos/kokkos-kernels/pull/1464) +- trsv: remove assumptions about entry order within rows [\#1463](https://github.com/kokkos/kokkos-kernels/pull/1463) +- sycl: re-enabling test now that dpcpp has made progress [\#1473](https://github.com/kokkos/kokkos-kernels/pull/1473) +- Blas serial axpy and nrm2 [\#1460](https://github.com/kokkos/kokkos-kernels/pull/1460) +- Move Set/Scale unit test to KokkosBlas [\#1455](https://github.com/kokkos/kokkos-kernels/pull/1455) +- Move {Serial,Team,TeamVector} Set to KokkosBlas [\#1454](https://github.com/kokkos/kokkos-kernels/pull/1454) +- dot perf test: adding support for HIP and SYCL backend [\#1453](https://github.com/kokkos/kokkos-kernels/pull/1453) +- csc2csr: update Kokkos_Numeric.hpp header inclusion [\#1449](https://github.com/kokkos/kokkos-kernels/pull/1449) +- Move {Serial,Team,TeamVector}Scale to KokkosBlas [\#1448](https://github.com/kokkos/kokkos-kernels/pull/1448) +- Sparse utils namespace [\#1439](https://github.com/kokkos/kokkos-kernels/pull/1439) +- Arith traits integral nan [\#1438](https://github.com/kokkos/kokkos-kernels/pull/1438) +- Common Utils: removing dependency on Sparse Utils in Common Utils [\#1436](https://github.com/kokkos/kokkos-kernels/pull/1436) +- Common cleanup [\#1431](https://github.com/kokkos/kokkos-kernels/pull/1431) +- Minor changes for half precision paper [\#1429](https://github.com/kokkos/kokkos-kernels/pull/1429) +- Add benchmarks for us-rse escience 2022 half precision paper [\#1422](https://github.com/kokkos/kokkos-kernels/pull/1422) +- Change Controls::getParameter error message from stdout to stderr [\#1416](https://github.com/kokkos/kokkos-kernels/pull/1416) +- Sparse and CI updates [\#1411](https://github.com/kokkos/kokkos-kernels/pull/1411) +- Kokkos_ArithTraits: re-implementation using Kokkos Core [\#1406](https://github.com/kokkos/kokkos-kernels/pull/1406) +- D1 coloring: remove unused but set variable [\#1403](https://github.com/kokkos/kokkos-kernels/pull/1403) +- Add ETI for D1 coloring [\#1401](https://github.com/kokkos/kokkos-kernels/pull/1401) +- Add ETI to SpAdd (symbolic and numeric) [\#1399](https://github.com/kokkos/kokkos-kernels/pull/1399) +- Clean-up src: re-organizing the src directory [\#1398](https://github.com/kokkos/kokkos-kernels/pull/1398) +- Update Batched GMRES [\#1392](https://github.com/kokkos/kokkos-kernels/pull/1392) +- Add verbosity parameter to GMRES example. Turn off for testing. [\#1385](https://github.com/kokkos/kokkos-kernels/pull/1385) +- Value-initialize result of MaxLoc reduction to avoid maybe uninitialized warning [\#1383](https://github.com/kokkos/kokkos-kernels/pull/1383) +- Remove volatile qualifiers in reducer join(), init(), and operator+= methods [\#1382](https://github.com/kokkos/kokkos-kernels/pull/1382) +- sparse: Remove csc2csr copy [\#1375](https://github.com/kokkos/kokkos-kernels/pull/1375) +- Minor updates to cluster Gauss-Seidel [\#1372](https://github.com/kokkos/kokkos-kernels/pull/1372) +- KokkosSparse_spiluk.cpp perf test: add int-int guards to cusparse codes [\#1369](https://github.com/kokkos/kokkos-kernels/pull/1369) +- Restructure docs [\#1368](https://github.com/kokkos/kokkos-kernels/pull/1368) +- Only instantiate Kokkos's default Cuda mem space [\#1361](https://github.com/kokkos/kokkos-kernels/pull/1361) +- GEMV: accumulate in float for scalar = bhalf_t [\#1360](https://github.com/kokkos/kokkos-kernels/pull/1360) +- Newer sparse tests were not following the new testing pattern [\#1356](https://github.com/kokkos/kokkos-kernels/pull/1356) +- Restore BLAS-1 MV paths for 1 column [\#1354](https://github.com/kokkos/kokkos-kernels/pull/1354) +- perf_test/blas: Check ARMPL build version [\#1352](https://github.com/kokkos/kokkos-kernels/pull/1352) +- Clean-up batched block tridiag perf test [\#1343](https://github.com/kokkos/kokkos-kernels/pull/1343) +- Reduce lots of macro duplication in sparse unit tests [\#1340](https://github.com/kokkos/kokkos-kernels/pull/1340) +- Add unit test for BsrMatrix and BlockCrsMatrix spmv [\#1338](https://github.com/kokkos/kokkos-kernels/pull/1338) +- Refactor SPGEMM MKL Impl [\#1244](https://github.com/kokkos/kokkos-kernels/pull/1244) + +### Bug Fixes: +- TPLs: adding CUBLAS in the list of dependencies [\#1482](https://github.com/kokkos/kokkos-kernels/pull/1482) +- Fix MKL build errors [\#1478](https://github.com/kokkos/kokkos-kernels/pull/1478) +- Fixup drop layout template param in rank-0 views [\#1476](https://github.com/kokkos/kokkos-kernels/pull/1476) +- BLAS: fixing test that access results before synching [\#1472](https://github.com/kokkos/kokkos-kernels/pull/1472) +- Fix D1 color ETI with both CudaSpace and UVM [\#1471](https://github.com/kokkos/kokkos-kernels/pull/1471) +- Fix arithtraits warning [\#1468](https://github.com/kokkos/kokkos-kernels/pull/1468) +- Fix build when double not instantiated [\#1467](https://github.com/kokkos/kokkos-kernels/pull/1467) +- Fix -Werror [\#1466](https://github.com/kokkos/kokkos-kernels/pull/1466) +- Fix GitHub CI failing on broken develop [\#1461](https://github.com/kokkos/kokkos-kernels/pull/1461) +- HIP: fix warning from ExecSpaceUtils and GEMV [\#1459](https://github.com/kokkos/kokkos-kernels/pull/1459) +- Removes a duplicate cuda_data_type_from when KOKKOS_HALF_T_IS_FLOAT [\#1456](https://github.com/kokkos/kokkos-kernels/pull/1456) +- Fix incorrect function call in KokkosBatched::TeamGEMV unit test [\#1444](https://github.com/kokkos/kokkos-kernels/pull/1444) +- Fix SYCL nightly test [\#1419](https://github.com/kokkos/kokkos-kernels/pull/1419) +- Fix issues with cuSparse TPL availability for BsrMatrix SpMV [\#1418](https://github.com/kokkos/kokkos-kernels/pull/1418) +- SpMV: fixing issues with unit-tests tolerance [\#1412](https://github.com/kokkos/kokkos-kernels/pull/1412) +- Address 1409 [\#1410](https://github.com/kokkos/kokkos-kernels/pull/1410) +- Fix colliding include guards (copy-paste mistake) [\#1408](https://github.com/kokkos/kokkos-kernels/pull/1408) +- src/sparse: Fix & check for fence post errors [\#1405](https://github.com/kokkos/kokkos-kernels/pull/1405) +- Bspgemm fixes [\#1396](https://github.com/kokkos/kokkos-kernels/pull/1396) +- Fix unused parameter warnings in GEMM test. [\#1381](https://github.com/kokkos/kokkos-kernels/pull/1381) +- Fixes code deprecation warnings. [\#1379](https://github.com/kokkos/kokkos-kernels/pull/1379) +- Fix sign-compare warning in SPMV perf test [\#1371](https://github.com/kokkos/kokkos-kernels/pull/1371) +- Minor MKL fixes [\#1365](https://github.com/kokkos/kokkos-kernels/pull/1365) +- perf_test/batched: Temporarily disable tests [\#1359](https://github.com/kokkos/kokkos-kernels/pull/1359) +- Fix nightly builds following promotion of the math functions in Kokkos [\#1339](https://github.com/kokkos/kokkos-kernels/pull/1339) + + +## [3.6.01](https://github.com/kokkos/kokkos-kernels/tree/3.6.01) (2022-05-23) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.6.00...3.6.01) + +### Bug Fixes and Improvements: + +- Improve spiluk numeric phase to avoid race conditions and processing in chunks [\#1390](https://github.com/kokkos/kokkos-kernels/pull/1390) +- Improve sptrsv symbolic phase performance (level scheduling) [\#1380](https://github.com/kokkos/kokkos-kernels/pull/1380) +- Restore BLAS-1 MV paths for 1 column [\#1354](https://github.com/kokkos/kokkos-kernels/pull/1354) +- Fix check that view has const type [\#1370](https://github.com/kokkos/kokkos-kernels/pull/1370) +- Fix check that view has const type part 2 [\#1394](https://github.com/kokkos/kokkos-kernels/pull/1394) + ## [3.6.00](https://github.com/kokkos/kokkos-kernels/tree/3.6.00) (2022-02-18) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.5.00...3.6.00) From 3be655677daf71b8b1899014b8df589584d1256a Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Tue, 23 Aug 2022 08:41:21 -0600 Subject: [PATCH 259/261] Reformat changelog by categories Creating categories by topic for new features and enhancements, we could consider moving some of the bug fixes to features/enhancements when the bug fix is really an clean-up of an issue that was not caught by the CI-tests? --- CHANGELOG.md | 81 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4eb3d438e8..d794acb2dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,58 +4,87 @@ [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.6.01...3.7.00) ### Features: + +#### Final Bsr algorithms implemented for multigrid: - Sparse: bsr transpose algorithm [\#1477](https://github.com/kokkos/kokkos-kernels/pull/1477) +- BSR block SpGEMM implementation [\#1099](https://github.com/kokkos/kokkos-kernels/pull/1099) + +#### Adding batched dense linear and non-linear system solvers: +- Add batched GESV [\#1384](https://github.com/kokkos/kokkos-kernels/pull/1384) - Newton solver: serial on device implementation of Newton's method [\#1479](https://github.com/kokkos/kokkos-kernels/pull/1479) + +#### Add sparse matrix conversion: +- Add csc2csr [\#1342](https://github.com/kokkos/kokkos-kernels/pull/1342) +- csc2csr: update Kokkos_Numeric.hpp header inclusion [\#1449](https://github.com/kokkos/kokkos-kernels/pull/1449) +- sparse: Remove csc2csr copy [\#1375](https://github.com/kokkos/kokkos-kernels/pull/1375) + +#### New documentation in readthedocs - Added https://kokkos-kernels.readthedocs.io [\#1451](https://github.com/kokkos/kokkos-kernels/pull/1451) +- Restructure docs [\#1368](https://github.com/kokkos/kokkos-kernels/pull/1368) + +#### Fix issues with TPLs for mutlivector SPMV - Add cuSparse TPL files for CrsMatrix-multivector product [\#1427](https://github.com/kokkos/kokkos-kernels/pull/1427) -- Add batched GESV [\#1384](https://github.com/kokkos/kokkos-kernels/pull/1384) -- Add csc2csr [\#1342](https://github.com/kokkos/kokkos-kernels/pull/1342) -- BSR block SpGEMM implementation [\#1099](https://github.com/kokkos/kokkos-kernels/pull/1099) ### Deprecations: - Add template params to forwarding calls in deprecated KokkosKernels::… [\#1441](https://github.com/kokkos/kokkos-kernels/pull/1441) ### Implemented enhancements: + +#### - SPILUK: Move host allocations to symbolic [\#1480](https://github.com/kokkos/kokkos-kernels/pull/1480) -- Reformat example/fenl files changed in 1382 [\#1464](https://github.com/kokkos/kokkos-kernels/pull/1464) - trsv: remove assumptions about entry order within rows [\#1463](https://github.com/kokkos/kokkos-kernels/pull/1463) -- sycl: re-enabling test now that dpcpp has made progress [\#1473](https://github.com/kokkos/kokkos-kernels/pull/1473) + +#### Hierarchical BLAS algorithms, added and moved from batched: - Blas serial axpy and nrm2 [\#1460](https://github.com/kokkos/kokkos-kernels/pull/1460) - Move Set/Scale unit test to KokkosBlas [\#1455](https://github.com/kokkos/kokkos-kernels/pull/1455) - Move {Serial,Team,TeamVector} Set to KokkosBlas [\#1454](https://github.com/kokkos/kokkos-kernels/pull/1454) -- dot perf test: adding support for HIP and SYCL backend [\#1453](https://github.com/kokkos/kokkos-kernels/pull/1453) -- csc2csr: update Kokkos_Numeric.hpp header inclusion [\#1449](https://github.com/kokkos/kokkos-kernels/pull/1449) - Move {Serial,Team,TeamVector}Scale to KokkosBlas [\#1448](https://github.com/kokkos/kokkos-kernels/pull/1448) -- Sparse utils namespace [\#1439](https://github.com/kokkos/kokkos-kernels/pull/1439) -- Arith traits integral nan [\#1438](https://github.com/kokkos/kokkos-kernels/pull/1438) + +#### Code base organization and clean-ups: - Common Utils: removing dependency on Sparse Utils in Common Utils [\#1436](https://github.com/kokkos/kokkos-kernels/pull/1436) - Common cleanup [\#1431](https://github.com/kokkos/kokkos-kernels/pull/1431) -- Minor changes for half precision paper [\#1429](https://github.com/kokkos/kokkos-kernels/pull/1429) -- Add benchmarks for us-rse escience 2022 half precision paper [\#1422](https://github.com/kokkos/kokkos-kernels/pull/1422) -- Change Controls::getParameter error message from stdout to stderr [\#1416](https://github.com/kokkos/kokkos-kernels/pull/1416) +- Clean-up src: re-organizing the src directory [\#1398](https://github.com/kokkos/kokkos-kernels/pull/1398) +- Sparse utils namespace [\#1439](https://github.com/kokkos/kokkos-kernels/pull/1439) + +#### perf tests updates, fixes and clean-ups: +- dot perf test: adding support for HIP and SYCL backend [\#1453](https://github.com/kokkos/kokkos-kernels/pull/1453) +- Add verbosity parameter to GMRES example. Turn off for testing. [\#1385](https://github.com/kokkos/kokkos-kernels/pull/1385) +- KokkosSparse_spiluk.cpp perf test: add int-int guards to cusparse codes [\#1369](https://github.com/kokkos/kokkos-kernels/pull/1369) +- perf_test/blas: Check ARMPL build version [\#1352](https://github.com/kokkos/kokkos-kernels/pull/1352) +- Clean-up batched block tridiag perf test [\#1343](https://github.com/kokkos/kokkos-kernels/pull/1343) +- Reduce lots of macro duplication in sparse unit tests [\#1340](https://github.com/kokkos/kokkos-kernels/pull/1340) + +#### Infrastructure changes: ETI and testing upgrades, minor fixes +- sycl: re-enabling test now that dpcpp has made progress [\#1473](https://github.com/kokkos/kokkos-kernels/pull/1473) +- Only instantiate Kokkos's default Cuda mem space [\#1361](https://github.com/kokkos/kokkos-kernels/pull/1361) - Sparse and CI updates [\#1411](https://github.com/kokkos/kokkos-kernels/pull/1411) -- Kokkos_ArithTraits: re-implementation using Kokkos Core [\#1406](https://github.com/kokkos/kokkos-kernels/pull/1406) -- D1 coloring: remove unused but set variable [\#1403](https://github.com/kokkos/kokkos-kernels/pull/1403) +- Newer sparse tests were not following the new testing pattern [\#1356](https://github.com/kokkos/kokkos-kernels/pull/1356) - Add ETI for D1 coloring [\#1401](https://github.com/kokkos/kokkos-kernels/pull/1401) - Add ETI to SpAdd (symbolic and numeric) [\#1399](https://github.com/kokkos/kokkos-kernels/pull/1399) -- Clean-up src: re-organizing the src directory [\#1398](https://github.com/kokkos/kokkos-kernels/pull/1398) -- Update Batched GMRES [\#1392](https://github.com/kokkos/kokkos-kernels/pull/1392) -- Add verbosity parameter to GMRES example. Turn off for testing. [\#1385](https://github.com/kokkos/kokkos-kernels/pull/1385) +- Reformat example/fenl files changed in 1382 [\#1464](https://github.com/kokkos/kokkos-kernels/pull/1464) +- Change Controls::getParameter error message from stdout to stderr [\#1416](https://github.com/kokkos/kokkos-kernels/pull/1416) + +#### Kokkos alignment: update our implementations to use newer Kokkos features +- Arith traits integral nan [\#1438](https://github.com/kokkos/kokkos-kernels/pull/1438) +- Kokkos_ArithTraits: re-implementation using Kokkos Core [\#1406](https://github.com/kokkos/kokkos-kernels/pull/1406) - Value-initialize result of MaxLoc reduction to avoid maybe uninitialized warning [\#1383](https://github.com/kokkos/kokkos-kernels/pull/1383) - Remove volatile qualifiers in reducer join(), init(), and operator+= methods [\#1382](https://github.com/kokkos/kokkos-kernels/pull/1382) -- sparse: Remove csc2csr copy [\#1375](https://github.com/kokkos/kokkos-kernels/pull/1375) -- Minor updates to cluster Gauss-Seidel [\#1372](https://github.com/kokkos/kokkos-kernels/pull/1372) -- KokkosSparse_spiluk.cpp perf test: add int-int guards to cusparse codes [\#1369](https://github.com/kokkos/kokkos-kernels/pull/1369) -- Restructure docs [\#1368](https://github.com/kokkos/kokkos-kernels/pull/1368) -- Only instantiate Kokkos's default Cuda mem space [\#1361](https://github.com/kokkos/kokkos-kernels/pull/1361) + +#### BLAS and batched algorithms updates +- Update Batched GMRES [\#1392](https://github.com/kokkos/kokkos-kernels/pull/1392) - GEMV: accumulate in float for scalar = bhalf_t [\#1360](https://github.com/kokkos/kokkos-kernels/pull/1360) -- Newer sparse tests were not following the new testing pattern [\#1356](https://github.com/kokkos/kokkos-kernels/pull/1356) - Restore BLAS-1 MV paths for 1 column [\#1354](https://github.com/kokkos/kokkos-kernels/pull/1354) -- perf_test/blas: Check ARMPL build version [\#1352](https://github.com/kokkos/kokkos-kernels/pull/1352) -- Clean-up batched block tridiag perf test [\#1343](https://github.com/kokkos/kokkos-kernels/pull/1343) -- Reduce lots of macro duplication in sparse unit tests [\#1340](https://github.com/kokkos/kokkos-kernels/pull/1340) + +#### Sparse and Graph updates +- Minor updates to cluster Gauss-Seidel [\#1372](https://github.com/kokkos/kokkos-kernels/pull/1372) - Add unit test for BsrMatrix and BlockCrsMatrix spmv [\#1338](https://github.com/kokkos/kokkos-kernels/pull/1338) - Refactor SPGEMM MKL Impl [\#1244](https://github.com/kokkos/kokkos-kernels/pull/1244) +- D1 coloring: remove unused but set variable [\#1403](https://github.com/kokkos/kokkos-kernels/pull/1403) + +#### half precision paper +- Minor changes for half precision paper [\#1429](https://github.com/kokkos/kokkos-kernels/pull/1429) +- Add benchmarks for us-rse escience 2022 half precision paper [\#1422](https://github.com/kokkos/kokkos-kernels/pull/1422) + ### Bug Fixes: - TPLs: adding CUBLAS in the list of dependencies [\#1482](https://github.com/kokkos/kokkos-kernels/pull/1482) From c2e29f41a6af44ca74c708e78f57bec505add068 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 25 Aug 2022 21:05:12 -0600 Subject: [PATCH 260/261] Update master_history for Kokkos 3.7.00 --- master_history.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/master_history.txt b/master_history.txt index ddf9143c73..91399d7ba0 100644 --- a/master_history.txt +++ b/master_history.txt @@ -17,3 +17,4 @@ tag: 3.4.01 date: 05/20/2021 master: 564dccb3 release: 4c62eb86 tag: 3.5.00 date: 11/19/2021 master: 00189c0b release: f171533d tag: 3.6.00 date: 04/06/2022 master: 8381db04 release: a7e683c4 tag: 3.6.01 date: 05/23/2022 master: e09389ae release: e1d8de42 +tag: 3.7.00 date: 08/25/2022 master: 42ab7a29 release: 9cc88ffa From f32debb08a2db7a94b017740e864293976bcac43 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 31 Aug 2022 15:45:58 -0600 Subject: [PATCH 261/261] Delete remant Kokkos_ArithTraits file from src directory --- src/Kokkos_ArithTraits.hpp | 3979 ------------------------------------ 1 file changed, 3979 deletions(-) delete mode 100644 src/Kokkos_ArithTraits.hpp diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp deleted file mode 100644 index 68bcdf79ea..0000000000 --- a/src/Kokkos_ArithTraits.hpp +++ /dev/null @@ -1,3979 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_ARITHTRAITS_HPP -#define KOKKOS_ARITHTRAITS_HPP - -/// \file Kokkos_ArithTraits.hpp -/// \brief Declaration and definition of Kokkos::Details::ArithTraits - -#include -#include -#include -#include - -#ifdef HAVE_KOKKOSKERNELS_QUADMATH -#include -#endif // HAVE_KOKKOSKERNELS_QUADMATH - -#include -#include -#include -#include // std::complex -#include // std::numeric_limits -#ifdef __CUDACC__ -#include -#endif - -namespace { // anonymous - -/// \fn intPowImpl -/// \tparam IntType A built-in integer type. -/// \brief Implementation of intPowSigned and intPowUnsigned. -/// -/// \pre x != 0 -/// \pre y > 0 -/// -/// Use intPowSigned or intPowUnsigned for general y. -template -KOKKOS_FORCEINLINE_FUNCTION IntType intPowImpl(const IntType x, - const IntType y) { - // Recursion (unrolled into while loop): pow(x, 2y) = (x^y)^2 - IntType prod = x; - IntType y_cur = 1; - // If y == 1, then prod stays x. - while (y_cur < y) { - prod = prod * prod; - y_cur = y_cur << 1; - } - // abs(y - y_cur) < floor(log2(y)), so it won't hurt asymptotic run - // time to finish the remainder in a linear iteration. - if (y > y_cur) { - const IntType left = y - y_cur; - for (IntType k = 0; k < left; ++k) { - prod = prod * x; - } - } else if (y < y_cur) { - // There's probably a better way to do this in order to avoid the - // (expensive) integer division, but I'm not motivated to think of - // it at the moment. - const IntType left = y_cur - y; - for (IntType k = 0; k < left; ++k) { - prod = prod / x; - } - } - return prod; - - // y = 8: - // - // x,1 -> x^2,2 - // x^2,2 -> x^4,4 - // x^4,4 -> x^8,8 - // - // y = 9: - // - // x,1 -> x^2,2 - // x^2,2 -> x^4,4 - // x^4,4 -> x^8,8 - // - // y - y_cur is what's left over. Just do it one at a time. - // - // y = 3: - // x,1 -> x^2,2 - // x^2,2 -> x^4,4 -} - -// Warning free abs function for types where we don't know whether they are -// signed (like char) -template ::is_signed> -struct integer_abs { - static KOKKOS_INLINE_FUNCTION T abs(const T& val); -}; - -template -struct integer_abs { - static KOKKOS_INLINE_FUNCTION T abs(const T& x) { return x < 0 ? -x : x; } -}; - -template -struct integer_abs { - static KOKKOS_INLINE_FUNCTION T abs(const T& x) { return x; } -}; - -/// \fn intPowSigned -/// \tparam IntType A built-in signed integer type. -/// \brief Compute x raised to the power y. -/// -/// If the arguments are invalid (e.g., if x and y are both zero), the -/// result of this function is undefined. However, this function will -/// not throw an exception in that case. -template -KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if::is_signed, - IntType>::type - intPowSigned(const IntType x, const IntType y) { - // It's not entirely clear what to return if x and y are both zero. - // In the case of floating-point numbers, 0^0 is NaN. Here, though, - // I think it's safe to return 0. - if (x == 0) { - return 0; - } else if (y == 0) { - return 1; - } else if (y < 0) { - if (x == 1) { - return 1; - } else if (x == -1) { - return (y % 2 == 0) ? 1 : -1; - } else { - return 0; // round the fraction to zero - } - } - return intPowImpl(x, y); -} -template -KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if::is_signed, - IntType>::type - intPowSigned(const IntType x, const IntType y) { - // It's not entirely clear what to return if x and y are both zero. - // In the case of floating-point numbers, 0^0 is NaN. Here, though, - // I think it's safe to return 0. - if (x == 0) { - return 0; - } else if (y == 0) { - return 1; - } - return intPowImpl(x, y); -} - -/// \fn intPowUnsigned -/// \tparam IntType A built-in unsigned integer type. -/// \brief Compute x raised to the power y. -/// -/// If the arguments are invalid (e.g., if x and y are both zero), the -/// result of this function is undefined. However, this function will -/// not throw an exception in that case. -template -KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, - const IntType y) { - // It's not entirely clear what to return if x and y are both zero. - // In the case of floating-point numbers, 0^0 is NaN. Here, though, - // I think it's safe to return 0. - if (x == 0) { - return 0; - } else if (y == 0) { - return 1; - } else { - return intPowImpl(x, y); - } -} - -// It might make sense to use special sqrt() approximations for -// integer arguments, like those presented on the following web site: -// -// http://www.azillionmonkeys.com/qed/sqroot.html#implementations -// -// Note that some of the implementations on the above page break ANSI -// C(++) aliasing rules (by assigning to the results of -// reinterpret_cast-ing between int and float). It's also just a -// performance optimization and not required for a reasonable -// implementation. - -} // namespace - -namespace Kokkos { -namespace Details { - -/// \class ArithTraits -/// \brief Traits class for arithmetic on type T. -/// \tparam T "Scalar" type of interest -/// -/// This is a traits class for the "arithmetic" type T. "Arithmetic -/// types" include built-in signed and unsigned integer types, -/// floating-point types, complex-valued types, and anything else that -/// looks like these. This class is useful for implementing numerical -/// algorithms that are generic on the data type. You may also use -/// this class to query attributes of T, like whether it is signed or -/// complex, or its precision. -/// -/// We really did not want to implement this class or expose it to -/// users. It would be much better to use existing traits classes -/// like std::numeric_limits. We decided to implement and expose this -/// class for the following reasons: -///
    -///
  1. std::numeric_limits class methods cannot be used in CUDA -/// device functions, since they themselves are not device -/// functions
  2. -///
  3. Existing traits classes like std::numeric_limits do not -/// provide enough information to implement algorithms that are -/// agnostic of whether T is real-valued or complex-valued.
  4. -///
-/// -/// All class methods must be suitable for parallel kernels, if the -/// type T itself is suitable for parallel kernels. In particular, -/// specializations for types T that make sense to use on a CUDA -/// device must mark all class methods as device (and host) functions, -/// using the KOKKOS_FORCEINLINE_FUNCTION macro. All class methods must be -/// callable both inside and outside a parallel kernel (for CUDA, this -/// means they must be marked as both device and host functions). -/// -/// \section Kokkos_ArithTraits_compat Compatibility -/// -/// Whenever possible, class methods in ArithTraits use the same names -/// as their equivalents in the C++ Standard Library. If this was not -/// possible, for example with isInf and isNan, we explain why in -/// their documentation. -/// -/// This class has redundant typedefs and methods in order to maintain -/// backwards compatibility with Teuchos::ScalarTraits, while -/// preferring forwards (partial) compatibility with -/// std::numeric_limits. Users should prefer typedefs, \c bool -/// constants, and class methods compatible with std::numeric_limits, -/// to those from Teuchos::ScalarTraits. The latter may go away at -/// any time. Furthermore, Teuchos::ScalarTraits contains methods -/// that do not make sense for use as parallel device functions, in -/// particular those relating to pseudorandom number generation that -/// refer to hidden state, so we will never include all class methods -/// from Teuchos::ScalarTraits in ArithTraits. -/// -/// \section Kokkos_ArithTraits_unsupp Unsupported types on CUDA devices -/// -/// CUDA does not support long double or std::complex in device -/// functions. ArithTraits does have specializations for these types, -/// but the class methods therein are not marked as device functions. -/// -/// \section Kokkos_ArithTraits_whyNotC99 What about C99 integer types? -/// -/// C99 and C++11 include typedefs int${N}_t and uint${N}_t, where N -/// is the number of bits in the integer. These typedefs are useful -/// because they make the length of the type explicit. Users are -/// welcome to use these types as the template parameter of -/// ArithTraits. -/// -/// We chose not to use these types when defining full -/// specializations of ArithTraits. This is because the C99 integer -/// types are typedefs, not types in themselves. This makes it -/// impossible to avoid duplicate or missing full specializations of -/// ArithTraits. For example, on my Mac, for CUDA 5.5, gcc 4.2.1, and -/// Clang 3.2, int64_t is a typedef of long long, -/// but long long and long are separate types, even -/// though they have the same length (64 bits). In contrast, on -/// Windows (even Win64), long is a 32-bit type (but a -/// distinct type from int), and long long is a -/// 64-bit type. Thus, if we define full specializations of -/// ArithTraits using only the C99 integer types, we will be -/// missing a specialization for long on at least one -/// platform. -/// -/// Rather than trouble ourselves with trying to figure this out for -/// each platform, we decided to provide specializations only for the -/// integer types in the C89 and C++03 language standards. This -/// includes signed and unsigned versions of char, -/// short, int, and long. We also include -/// long long if your platform supports it. We may thus have -/// left out some C99 integer type, but this is only possible if the -/// C89 / C++03 integer types do not have complete coverage of all -/// powers of two bits from 8 up to the longest provided length (e.g., -/// 64 on a 64-bit system). On all platforms I have encountered, -/// char has 8 bits and short has 16 bits, so I am -/// not worried about missing specializations for int16_t or -/// uint16_t. If you should find that either of these -/// specializations are missing, though, please let us know. -/// -/// Note that char, signed char, and unsigned -/// char are distinct types, whether char is signed or -/// unsigned. (The language standards do not specify whether -/// char is signed or unsigned.) That is, char is -/// not a typedef of signed char or unsigned -/// char. This is why we provide full specializations of -/// ArithTraits for each of these types. Interestingly enough, on my -/// system, char and int8_t are different types, but -/// signed char and int8_t are the same. -/// -/// \section Kokkos_ArithTraits_impl Implementation notes -/// -/// This section contains notes to developers who which to add a -/// partial specialization of this class for a new type T. If you -/// decide to write a default templated implementation, it must not -/// declare any methods as device functions. This ensures correct -/// behavior for arbitrary T, but does require specializations for -/// common types like T = float and double, as well as for other types -/// T that make sense to use on a CUDA device. -template -class ArithTraits { - public: - /// \brief A type that acts like T and works with Kokkos. - /// - /// This is usually just an alias for T. However, some types T do - /// not work well with Kokkos. In that case, we use a mostly - /// equivalent type here. For example, ArithTraits - /// >::val_type is Kokkos::complex. - typedef T val_type; - /// \brief The type of the magnitude (absolute value) of T. - /// - /// We define this as the type returned by abs() in this class. If - /// T is real (not complex), then \c val_type and \c mag_type are - /// usually the same. If T is std::complex for some R, - /// then R and \c mag_type are usually the same. - typedef T mag_type; - - //! Whether ArithTraits has a specialization for T. - static const bool is_specialized = false; - //! Whether T is a signed type (has negative values). - static const bool is_signed = false; - //! Whether T is an integer type. - static const bool is_integer = false; - /// \brief Whether T "uses exact representations." - /// - /// The opposite of is_exact is "is approximate," that is, "may - /// commit rounding error." - static const bool is_exact = false; - //! Whether T is a complex-valued type. - static const bool is_complex = false; - - /// \brief Whether x is Inf. - /// - /// This can only be true for floating-point types T that support - /// Inf. If T is a complex type, we say that a T instance x is Inf - /// if and only if isinf(real(x)) || isinf(imag(x)). - /// - /// Unfortunately we can't call this "isinf" (the equivalent C99 - /// function), because CUDA appears to implement that function using - /// a macro, rather than using a function (as C++11 requires). - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const T& x); - - /// \brief Whether x is NaN (not a number). - /// - /// This can only be true for floating-point types T that support - /// NaN. If T is a complex type, we say that a T instance x is NaN - /// if and only if isNan(real(x)) || isNan(imag(x)). - /// - /// Unfortunately we can't call this "isnan" (the equivalent C99 - /// function), because CUDA appears to implement that function using - /// a macro, rather than using a function (as C++11 requires). - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const T& x); - - //! The absolute value (magnitude) of x. - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const T& x); - - //! The zero value of T; the arithmetic identity. - static KOKKOS_FORCEINLINE_FUNCTION T zero(); - - //! The one value of T; the multiplicative identity. - static KOKKOS_FORCEINLINE_FUNCTION T one(); - - /// \brief True if this type T is capable of representing the - /// positive infinity as a distinct special value, as with - /// std::numeric_limits::has_infinity. - static constexpr bool has_infinity = false; - - /// \brief Returns the special value "positive infinity", as - /// represented by the floating-point type T. Only meaningful if - /// KokkosArithTraits::has_infinity == true. Provides same - /// functionality as std::numeric_limits::infinity(). - /// - /// \note Would have liked to mark it as constexpr but then would - /// not be able to provide the specialization for std::complex - /// since its constructor only becomes constexpr with C++14. - static KOKKOS_FORCEINLINE_FUNCTION T infinity(); - - /// \brief The minimum possible value of T. - /// - /// If T is a real floating-point type, then this is the minimum - /// positive value, as with std::numeric_limits::min(). - static KOKKOS_FORCEINLINE_FUNCTION T min(); - - //! The maximum possible value of T. - static KOKKOS_FORCEINLINE_FUNCTION T max(); - - /// \brief The real part of x. - /// - /// If \c is_complex is false, then this just returns x. - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const T& x); - - /// \brief The imaginary part of x. - /// - /// If \c is_complex is false, then this just returns zero(). - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const T&); - - /// \brief The complex conjugate of x. - /// - /// If \c is_complex is false, then this just returns x. - static KOKKOS_FORCEINLINE_FUNCTION T conj(const T&); - - //! x raised to the power y. - static KOKKOS_FORCEINLINE_FUNCTION T pow(const T& x, const T& y); - - /// \brief The square root of x. - /// - /// If T is an integer type, this is the floor of the square root. - /// If T is a complex-valued type, then this method returns the - /// principal branch of the square root. - /// - /// If T is real-valued and x is negative, the result of the square - /// root is undefined in general. (CUDA does not allow throwing - /// exceptions in device functions.) Implementations should return - /// NaN if the type T supports this. Of course, in that case, the - /// square of the result will not equal x. - static KOKKOS_FORCEINLINE_FUNCTION T sqrt(const T& x); - - /// \brief The cubic root of x. - /// - /// If T is an integer type, this is the floor of the cubic root. - /// If T is a complex-valued type, then this method returns the - /// principal branch of the cubic root. - /// - /// If T is real-valued and x is negative, the result of the cubic - /// root is undefined in general. (CUDA does not allow throwing - /// exceptions in device functions.) Implementations should return - /// NaN if the type T supports this. Of course, in that case, the - /// cubic of the result will not equal x. - static KOKKOS_FORCEINLINE_FUNCTION T cbrt(const T& x); - - /// \brief The natural (base e) exponential function of x. - /// - /// If T is an integer type, this is the floor of the exponential - /// function. If T is a complex-valued type, then this method - /// returns \f$e^{x+iy} = e^x ( cos(y) + i sin(y) )\f$. - /// - static KOKKOS_FORCEINLINE_FUNCTION T exp(const T& x); - - /// \brief The natural (base e) logarithm of x. - /// - /// If T is an integer type, this is the floor of the logarithm. If - /// T is a complex-valued type, then this method returns the - /// principal branch of the logarithm. - /// - /// If T is real-valued and x is negative, the result of the - /// logarithm is undefined in general. (CUDA does not allow - /// throwing exceptions in device functions.) Implementations - /// should return NaN if the type T supports this. Of course, in - /// that case, if y is the result, \f$e^y\f$ will not equal x. - static KOKKOS_FORCEINLINE_FUNCTION T log(const T& x); - - /// \brief The base ten logarithm of the input. - /// - /// If T is an integer type, this is the floor of the logarithm. If - /// T is a complex-valued type, then this method returns the - /// principal branch of the logarithm. - /// - /// If T is real-valued and x is negative, the result of the - /// logarithm is undefined in general. (CUDA does not allow - /// throwing exceptions in device functions.) Implementations - /// should return NaN if the type T supports this. Of course, in - /// that case, if y is the result, \f$10^y\f$ will not equal x. - static KOKKOS_FORCEINLINE_FUNCTION T log10(const T& x); - - /// Trigonometric and hyperbolic functions are not available - /// for integer types. This is because asin(sin(x)) is not x - /// when x is integer with a rounding error. - /// - /// KJ: log, exp also has this problem. We probably need to - /// disable them for integer types instead of providing - /// functionality with floor. - - /// \brief The sin function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T sin(const T& x); - - /// \brief The cos function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T cos(const T& x); - - /// \brief The tan function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T tan(const T& x); - - /// \brief The sin hyperbolic function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T sinh(const T& x); - - /// \brief The cos hyperbolic function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T cosh(const T& x); - - /// \brief The tan hyperbolic function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T tanh(const T& x); - - /// \brief The asin function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T asin(const T& x); - - /// \brief The acos function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T acos(const T& x); - - /// \brief The atan function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T atan(const T& x); - - /// \brief Return a silent NaN, if appropriate for T. - /// - /// If T does not implement a silent NaN, the return value is - /// undefined, but calling this method is still allowed. - static KOKKOS_FORCEINLINE_FUNCTION T nan(); - - /// \brief Machine epsilon. - /// - /// If T is an integer type (std::numeric_traits::is_exact is - /// true), then epsilon() returns 0. Otherwise, if T is a - /// floating-point type, it returns machine epsilon that T. - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon(); - - //@{ - /// \name Traits defined for backwards compatibility with - /// Teuchos::ScalarTraits - /// - /// All of the typedefs, \c bool constants, and class methods in - /// this section are defined in order that one may replace most uses - /// of Teuchos::ScalarTraits with ArithTraits. Users who do not - /// have this backwards compatibility requirement should prefer - /// equivalents in other sections. Those class methods which have - /// the same name and meaning in both Teuchos::ScalarTraits and this - /// class, such as log() and pow(), are not in this section. - - //! Same as mag_type; the type of the absolute value (magnitude) of T. - typedef T magnitudeType; - - /// \brief The type with "half the precision" of T. - /// - /// This typedef only makes sense if T is a floating-point type. - typedef T halfPrecision; - - /// \brief The type with "twice the the precision" of T. - /// - /// This typedef only makes sense if T is a floating-point type. - typedef T doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = false; - - /// \brief True if this type T has floating-point parameters. - /// - /// This is true if and only if this specialization of ArithTraits - /// has "machine-specific" parameters eps(), sfmin(), base(), - /// prec(), t(), rnd(), emin(), rmin(), emax(), and rmax(), relating - /// to floating-point types. - static const bool hasMachineParameters = false; - - //! Return relative machine precision. - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps(); - - //! Return safe minimum (sfmin), such that 1/sfmin does not overflow. - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin(); - - //! Return the base of the scalar type T. - static KOKKOS_FORCEINLINE_FUNCTION int base(); - - //! Return eps*base. - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec(); - - //! Returns the number of (base) digits in the significand. - static KOKKOS_FORCEINLINE_FUNCTION int t(); - - //! 1.0 when rounding occurs in addition, else 0.0. - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd(); - - //! Returns the minimum exponent before (gradual) underflow. - static KOKKOS_FORCEINLINE_FUNCTION int emin(); - - //! Returns the underflow threshold: base^(emin-1) - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin(); - - //! Returns the largest exponent before overflow. - static KOKKOS_FORCEINLINE_FUNCTION int emax(); - - //! Overflow theshold: (base^emax)*(1-eps) - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax(); - - //! Same as abs(); return the magnitude of x. - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const T& x); - - //! Same as conj(); return the complex conjugate of x. - static KOKKOS_FORCEINLINE_FUNCTION T conjugate(const T& x); - - /// \brief Whether x is (silent) NaN or Inf. - /// - /// This is the same as isNan(x) || isInf(x). - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const T& x); - - /// \brief The string name of T. - /// - /// Note that this is not a device function. - static std::string name(); - - //! Same as sqrt(x); the square root of x. - static KOKKOS_FORCEINLINE_FUNCTION T squareroot(const T& x); - //@} -}; - -// Since Kokkos::Experimental::half_t falls back to float, only define -// ArithTraits if half_t is a backend specialization -#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT -template <> -class ArithTraits { - public: - typedef Kokkos::Experimental::half_t val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return Kokkos::Experimental::cast_to_half(HUGE_VALF); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { -#ifndef __CUDA_ARCH__ - using std::isinf; -#endif - return isinf(Kokkos::Experimental::cast_from_half(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { -#ifndef __CUDA_ARCH__ - using std::isnan; -#endif - return isnan(Kokkos::Experimental::cast_from_half(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return Kokkos::Experimental::cast_to_half( - fabs(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return Kokkos::Experimental::cast_to_half(0.0F); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return Kokkos::Experimental::cast_to_half(1.0F); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::cast_to_half(-KOKKOSKERNELS_IMPL_FP16_MAX); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return Kokkos::Experimental::cast_to_half(0.0F); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::Experimental::cast_to_half( - ::pow(Kokkos::Experimental::cast_from_half(x), - Kokkos::Experimental::cast_from_half(y))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::sqrt(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(Kokkos::Experimental::cast_from_half(x)) -#else - ::cbrt(Kokkos::Experimental::cast_from_half(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::exp(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::log(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::log10(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::sin(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::cos(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { - return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::tan(Kokkos::Experimental::cast_from_half(x)) -#else - ::tan(Kokkos::Experimental::cast_from_half(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::sinh(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::cosh(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::tanh(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { - return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::asin(Kokkos::Experimental::cast_from_half(x)) -#else - ::asin(Kokkos::Experimental::cast_from_half(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { - return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::acos(Kokkos::Experimental::cast_from_half(x)) -#else - ::acos(Kokkos::Experimental::cast_from_half(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { - return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::atan(Kokkos::Experimental::cast_from_half(x)) -#else - ::atan(Kokkos::Experimental::cast_from_half(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { - // return ::pow(2, -KOKKOSKERNELS_IMPL_FP16_SIGNIFICAND_BITS); - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_EPSILON); - } - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - // C++ doesn't have a standard "half-float" type. - typedef val_type halfPrecision; - typedef double doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type x) { - return isNan(x) || isInf(x); - } - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static std::string name() { return "half"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { -#ifdef __CUDA_ARCH__ - return Kokkos::Experimental::cast_to_half(CUDART_NAN_F); -#else - return Kokkos::Experimental::cast_to_half( - std::numeric_limits::quiet_NaN()); -#endif // __CUDA_ARCH__ - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { - return KOKKOSKERNELS_IMPL_FP16_RADIX; - } - // Use float to allow running on both host and device - static KOKKOS_FORCEINLINE_FUNCTION float prec() { - float e = KOKKOSKERNELS_IMPL_FP16_EPSILON; - float b = (float)base(); - float r = e * b; - return r; - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { - return KOKKOSKERNELS_IMPL_FP16_MANT_DIG; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { - return Kokkos::Experimental::cast_to_half(1.0); - } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { - return KOKKOSKERNELS_IMPL_FP16_MIN_EXP; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { - return KOKKOSKERNELS_IMPL_FP16_MAX_EXP; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); - } -}; -#endif // KOKKOS_HALF_T_IS_FLOAT && KOKKOS_ENABLE_CUDA_HALF - -// Since Kokkos::Experimental::bhalf_t falls back to float, only define -// ArithTraits if bhalf_t is a backend specialization -#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT -template <> -class ArithTraits { - public: - typedef Kokkos::Experimental::bhalf_t val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return Kokkos::Experimental::cast_to_bhalf(HUGE_VALF); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { -#ifndef __CUDA_ARCH__ - using std::isinf; -#endif - return isinf(Kokkos::Experimental::cast_from_bhalf(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { -#ifndef __CUDA_ARCH__ - using std::isnan; -#endif - return isnan(Kokkos::Experimental::cast_from_bhalf(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - fabs(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return Kokkos::Experimental::cast_to_bhalf(0.0F); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return Kokkos::Experimental::cast_to_bhalf(1.0F); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::cast_to_bhalf(-KOKKOSKERNELS_IMPL_BF16_MAX); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return Kokkos::Experimental::cast_to_bhalf(0.0F); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::Experimental::cast_to_bhalf( - ::pow(Kokkos::Experimental::cast_from_bhalf(x), - Kokkos::Experimental::cast_from_bhalf(y))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::sqrt(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::cbrt(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::exp(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::log(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::log10(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::sin(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::cos(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::tan(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::tan(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::sinh(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::cosh(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::tanh(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::asin(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::asin(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::acos(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::acos(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::atan(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::atan(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { - // return ::pow(2, -KOKKOSKERNELS_IMPL_BF16_SIGNIFICAND_BITS); - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_EPSILON); - } - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - // C++ doesn't have a standard "bhalf-float" type. - typedef val_type bhalfPrecision; - typedef double doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type x) { - return isNan(x) || isInf(x); - } - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static std::string name() { return "bhalf"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { -#ifdef __CUDA_ARCH__ - return Kokkos::Experimental::cast_to_bhalf(CUDART_NAN_F); -#else - return Kokkos::Experimental::cast_to_bhalf( - std::numeric_limits::quiet_NaN()); -#endif // __CUDA_ARCH__ - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN); - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { - return KOKKOSKERNELS_IMPL_BF16_RADIX; - } - // Use float to allow running on both host and device - static KOKKOS_FORCEINLINE_FUNCTION float prec() { - float e = KOKKOSKERNELS_IMPL_BF16_EPSILON; - float b = (float)base(); - float r = e * b; - return r; - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { - return KOKKOSKERNELS_IMPL_BF16_MANT_DIG; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { - return Kokkos::Experimental::cast_to_bhalf(1.0); - } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { - return KOKKOSKERNELS_IMPL_BF16_MIN_EXP; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN); - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { - return KOKKOSKERNELS_IMPL_BF16_MAX_EXP; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); - } -}; -#endif // KOKKOS_BHALF_T_IS_FLOAT - -template <> -class ArithTraits { - public: - typedef float val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION float infinity() { return HUGE_VALF; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isinf; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isinf; -#endif - return isinf(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isnan; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isnan; -#endif - return isnan(x); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const float x) { - return ::fabs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float zero() { return 0.0; } - static KOKKOS_FORCEINLINE_FUNCTION float one() { return 1.0; } - static KOKKOS_FORCEINLINE_FUNCTION float min() { return -FLT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION float max() { return FLT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const float x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const float) { return 0.0; } - static KOKKOS_FORCEINLINE_FUNCTION float conj(const float x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION float pow(const float x, const float y) { - return ::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION float sqrt(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION float cbrt(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::cbrt(x); -#else - return ::cbrt(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION float exp(const float x) { - return ::exp(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float log(const float x) { - return ::log(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float log10(const float x) { - return ::log10(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float sin(const float x) { - return ::sin(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float cos(const float x) { - return ::cos(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float tan(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::tan(x); -#else - return std::tan(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION float sinh(const float x) { - return ::sinh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float cosh(const float x) { - return ::cosh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float tanh(const float x) { - return ::tanh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float asin(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::asin(x); -#else - return ::asin(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION float acos(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::acos(x); -#else - return ::acos(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION float atan(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::atan(x); -#else - return ::atan(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return FLT_EPSILON; } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - // C++ doesn't have a standard "half-float" type. - typedef float halfPrecision; - typedef double doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const float x) { - return isNan(x) || isInf(x); - } - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const float x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float conjugate(const float x) { - return conj(x); - } - static std::string name() { return "float"; } - static KOKKOS_FORCEINLINE_FUNCTION float squareroot(const float x) { - return sqrt(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float nan() { -#if defined(__CUDA_ARCH__) - return CUDART_NAN_F; - // return nan (); //this returns 0??? -#elif defined(__HIP_DEVICE_COMPILE__) - return ::nanf(""); -#else - return std::numeric_limits::quiet_NaN(); -#endif // __CUDA_ARCH__ - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return FLT_MIN; // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { return FLT_RADIX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { - return eps() * static_cast(base()); - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { return FLT_MANT_DIG; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return 1.0; } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { return FLT_MIN_EXP; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return FLT_MIN; // ??? // should be base^(emin-1) - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { return FLT_MAX_EXP; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return FLT_MAX; // ??? // should be (base^emax)*(1-eps) - } -}; - -/// \brief Partial specialization for std::complex. -/// -/// The C++ Standard Library (with C++03 at least) only allows -/// std::complex for RealFloatType = float, double, or -/// long double. -template -class ArithTraits > { - public: - //! Kokkos internally replaces std::complex with Kokkos::complex. - typedef ::Kokkos::complex val_type; - typedef RealFloatType mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = true; - - static constexpr bool has_infinity = true; - static std::complex infinity() { - return std::complex(ArithTraits::infinity(), - ArithTraits::infinity()); - } - -#ifdef KOKKOS_ENABLE_SYCL - template - static bool isInf(const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isinf; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isinf; -#endif - return isinf(real(x)) || isinf(imag(x)); - } - template <> - static bool isInf(const std::complex& x) { - Kokkos::abort("isInf not available for std::complex!\n"); - return true; - } -#else - static bool isInf(const std::complex& x) { - return Kokkos::Experimental::isinf(real(x)) || - Kokkos::Experimental::isinf(imag(x)); - } -#endif -#ifdef KOKKOS_ENABLE_SYCL - template - static bool isNan(const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isnan; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isnan; -#endif - return isnan(real(x)) || isnan(imag(x)); - } - template <> - static bool isNan(const std::complex& x) { - Kokkos::abort("isNan not available for std::complex!\n"); - return true; - } -#else - static bool isNan(const std::complex& x) { - return Kokkos::Experimental::isnan(real(x)) || - Kokkos::Experimental::isnan(imag(x)); - } -#endif - static mag_type abs(const std::complex& x) { - return std::abs(x); - } - static std::complex zero() { - return std::complex(ArithTraits::zero(), - ArithTraits::zero()); - } - static std::complex one() { - return std::complex(ArithTraits::one(), - ArithTraits::zero()); - } - static std::complex min() { - return std::complex(ArithTraits::min(), - ArithTraits::zero()); - } - static std::complex max() { - return std::complex(ArithTraits::max(), - ArithTraits::zero()); - } - static mag_type real(const std::complex& x) { - return std::real(x); - } - static mag_type imag(const std::complex& x) { - return std::imag(x); - } - static std::complex conj( - const std::complex& x) { - return std::conj(x); - } - static std::complex pow(const std::complex& x, - const std::complex& y) { - // Fix for some weird gcc 4.2.1 inaccuracy. - if (y == one()) { - return x; - } else if (y == one() + one()) { - return x * x; - } else { - return std::pow(x, y); - } - } - static std::complex pow(const std::complex& x, - const RealFloatType& y) { - // Fix for some weird gcc 4.2.1 inaccuracy. - if (y == ArithTraits::one()) { - return x; - } else if (y == ArithTraits::one() + - ArithTraits::one()) { - return x * x; - } else { - return std::pow(x, y); - } - } - static std::complex sqrt( - const std::complex& x) { - return std::sqrt(x); - } - static std::complex cbrt( - const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::cbrt(x); -#else - return ::cbrt(x); -#endif - } - static std::complex exp(const std::complex& x) { - return std::exp(x); - } - static std::complex log(const std::complex& x) { - return std::log(x); - } - static std::complex log10( - const std::complex& x) { - return std::log10(x); - } - static std::complex sin(const std::complex& x) { - return std::sin(x); - } - static std::complex cos(const std::complex& x) { - return std::cos(x); - } - static std::complex tan(const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::tan(x); -#else - return std::tan(x); -#endif - } - static std::complex sinh( - const std::complex& x) { - return std::sinh(x); - } - static std::complex cosh( - const std::complex& x) { - return std::cosh(x); - } - static std::complex tanh( - const std::complex& x) { - return std::tanh(x); - } - static std::complex asin( - const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::asin(x); -#else - return ::asin(x); -#endif - } - static std::complex acos( - const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::acos(x); -#else - return ::acos(x); -#endif - } - static std::complex atan( - const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using sycl::atan; -#else - using std::atan; -#endif - return atan(x); - } - static std::complex nan() { - const mag_type mag_nan = ArithTraits::nan(); - return std::complex(mag_nan, mag_nan); - } - static mag_type epsilon() { return ArithTraits::epsilon(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef std::complex::halfPrecision> - halfPrecision; - typedef std::complex::doublePrecision> - doublePrecision; - - static const bool isComplex = true; - static const bool isOrdinal = false; - static const bool isComparable = false; - static const bool hasMachineParameters = true; - static bool isnaninf(const std::complex& x) { - return isNan(x) || isInf(x); - } - static mag_type magnitude(const std::complex& x) { - return abs(x); - } - static std::complex conjugate( - const std::complex& x) { - return conj(x); - } - static std::string name() { - return std::string("std::complex<") + ArithTraits::name() + ">"; - } - static std::complex squareroot( - const std::complex& x) { - return sqrt(x); - } - static mag_type eps() { return epsilon(); } - static mag_type sfmin() { return ArithTraits::sfmin(); } - static int base() { return ArithTraits::base(); } - static mag_type prec() { return ArithTraits::prec(); } - static int t() { return ArithTraits::t(); } - static mag_type rnd() { return ArithTraits::one(); } - static int emin() { return ArithTraits::emin(); } - static mag_type rmin() { return ArithTraits::rmin(); } - static int emax() { return ArithTraits::emax(); } - static mag_type rmax() { return ArithTraits::rmax(); } -}; - -template <> -class ArithTraits { - public: - typedef double val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION double infinity() { return HUGE_VAL; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isinf; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isinf; -#endif - return isinf(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isnan; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isnan; -#endif - return isnan(x); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return ::fabs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0.0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1.0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return -DBL_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return DBL_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return 0.0; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return ::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::cbrt(x); -#else - return ::cbrt(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return ::exp(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return ::log(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return ::log10(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { - return ::sin(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { - return ::cos(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::tan(x); -#else - return std::tan(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { - return ::sinh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { - return ::cosh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { - return ::tanh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::asin(x); -#else - return ::asin(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::acos(x); -#else - return ::acos(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::atan(x); -#else - return ::atan(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { -#if defined(__CUDA_ARCH__) - return CUDART_NAN; - // return nan (); // this returns 0 ??? -#elif defined(__HIP_DEVICE_COMPILE__) - return ::nan(""); -#else - return std::numeric_limits::quiet_NaN(); -#endif // __CUDA_ARCH__ - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return DBL_EPSILON; } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef float halfPrecision; -#if defined(__CUDA_ARCH__) - typedef double - doublePrecision; // CUDA doesn't support long double, unfortunately -#elif defined(__HIP_DEVICE_COMPILE__) - typedef double - doublePrecision; // HIP does not support long double unfortunately -#else - typedef long double doublePrecision; -#endif // __CUDA_ARCH__ - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static std::string name() { return "double"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return DBL_MIN; // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { - return FLT_RADIX; // same for float as for double - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { - return eps() * static_cast(base()); - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { return DBL_MANT_DIG; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return 1.0; } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { return DBL_MIN_EXP; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return DBL_MIN; // ??? // should be base^(emin-1) - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { return DBL_MAX_EXP; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return DBL_MAX; // ??? // should be (base^emax)*(1-eps) - } -}; - -// CUDA and HIP do not support long double in device functions, -// so none of the class methods in this specialization are marked -// as device functions. -template <> -class ArithTraits { - public: - typedef long double val_type; - typedef long double mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static constexpr bool has_infinity = true; - static long double infinity() { return HUGE_VALL; } - - static bool isInf(const val_type& x) { - using std::isinf; - return isinf(x); - } - static bool isNan(const val_type& x) { - using std::isnan; - return isnan(x); - } - static mag_type abs(const val_type& x) { return ::fabsl(x); } - static val_type zero() { return 0.0; } - static val_type one() { return 1.0; } - static val_type min() { return -LDBL_MAX; } - static val_type max() { return LDBL_MAX; } - static mag_type real(const val_type& x) { return x; } - static mag_type imag(const val_type&) { return zero(); } - static val_type conj(const val_type& x) { return x; } - static val_type pow(const val_type& x, const val_type& y) { - return ::pow(x, y); - } - static val_type sqrt(const val_type& x) { return ::sqrt(x); } - static val_type cbrt(const val_type& x) { return ::cbrtl(x); } - static val_type exp(const val_type& x) { return ::exp(x); } - static val_type log(const val_type& x) { return ::log(x); } - static val_type log10(const val_type& x) { return ::log10(x); } - static val_type sin(const val_type& x) { return ::sin(x); } - static val_type cos(const val_type& x) { return ::cos(x); } - static val_type tan(const val_type& x) { return ::tan(x); } - static val_type sinh(const val_type& x) { return ::sinh(x); } - static val_type cosh(const val_type& x) { return ::cosh(x); } - static val_type tanh(const val_type& x) { return ::tanh(x); } - static val_type asin(const val_type& x) { return ::asin(x); } - static val_type acos(const val_type& x) { return ::acos(x); } - static val_type atan(const val_type& x) { return ::atan(x); } - static val_type nan() { return std::numeric_limits::quiet_NaN(); } - static mag_type epsilon() { return LDBL_EPSILON; } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef double halfPrecision; - // It might be appropriate to use QD's qd_real here. - // For now, long double is the most you get. - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static mag_type magnitude(const val_type& x) { return abs(x); } - static val_type conjugate(const val_type& x) { return conj(x); } - static std::string name() { return "long double"; } - static val_type squareroot(const val_type& x) { return sqrt(x); } - static mag_type eps() { return epsilon(); } - static mag_type sfmin() { - return LDBL_MIN; // ??? - } - static int base() { - return FLT_RADIX; // same for float as for double or long double - } - static mag_type prec() { return eps() * static_cast(base()); } - static int t() { return LDBL_MANT_DIG; } - static mag_type rnd() { return one(); } - static int emin() { return LDBL_MIN_EXP; } - static mag_type rmin() { return LDBL_MIN; } - static int emax() { return LDBL_MAX_EXP; } - static mag_type rmax() { return LDBL_MAX; } -}; // long double specialization - -#ifdef HAVE_KOKKOSKERNELS_QUADMATH - -// CUDA does not support __float128 in device functions, so none of -// the class methods in this specialization are marked as device -// functions. -template <> -class ArithTraits<__float128> { - public: - typedef __float128 val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static constexpr bool has_infinity = true; - static __float128 infinity() { return 1.0q / 0.0q; } - - static bool isInf(const __float128 x) { return isinfq(x); } - static bool isNan(const __float128 x) { return isnanq(x); } - static mag_type abs(const __float128 x) { return fabsq(x); } - static __float128 zero() { return 0.0; } - static __float128 one() { return 1.0; } - static __float128 min() { return FLT128_MIN; } - static __float128 max() { return FLT128_MAX; } - static mag_type real(const __float128 x) { return x; } - static mag_type imag(const __float128 /* x */) { return 0.0; } - static __float128 conj(const __float128 x) { return x; } - static __float128 pow(const __float128 x, const __float128 y) { - return powq(x, y); - } - static __float128 sqrt(const __float128 x) { return sqrtq(x); } - static __float128 cbrt(const __float128 x) { return cbrtq(x); } - static __float128 exp(const __float128 x) { return exp(x); } - static __float128 log(const __float128 x) { return logq(x); } - static __float128 log10(const __float128 x) { return log10q(x); } - static __float128 sin(const __float128 x) { return sinq(x); } - static __float128 cos(const __float128 x) { return cosq(x); } - static __float128 tan(const __float128 x) { return tanq(x); } - static __float128 sinh(const __float128 x) { return sinhq(x); } - static __float128 cosh(const __float128 x) { return coshq(x); } - static __float128 tanh(const __float128 x) { return tanhq(x); } - static __float128 asin(const __float128 x) { return asinq(x); } - static __float128 acos(const __float128 x) { return acosq(x); } - static __float128 atan(const __float128 x) { return atanq(x); } - static mag_type epsilon() { return FLT128_EPSILON; } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef double halfPrecision; - // Unfortunately, we can't rely on a standard __float256 type. - typedef __float128 doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - static bool isnaninf(const __float128 x) { return isNan(x) || isInf(x); } - static magnitudeType magnitude(const __float128 x) { return abs(x); } - static __float128 conjugate(const __float128 x) { return conj(x); } - static std::string name() { return "__float128"; } - static __float128 squareroot(const __float128 x) { return sqrt(x); } - static __float128 nan() { - return strtoflt128("NAN()", NULL); // ??? - } - static mag_type eps() { return epsilon(); } - static mag_type sfmin() { - return FLT128_MIN; // ??? - } - static int base() { return 2; } - static mag_type prec() { return eps() * static_cast(base()); } - static int t() { return FLT_MANT_DIG; } - static mag_type rnd() { return 1.0; } - static int emin() { return FLT128_MIN_EXP; } - static mag_type rmin() { - return FLT128_MIN; // ??? // should be base^(emin-1) - } - static int emax() { return FLT128_MAX_EXP; } - static mag_type rmax() { - return FLT128_MAX; // ??? // should be (base^emax)*(1-eps) - } -}; -#endif // HAVE_KOKKOSKERNELS_QUADMATH - -template <> -class ArithTraits< ::Kokkos::complex > { - public: - typedef ::Kokkos::complex val_type; - typedef float mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = true; - - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return val_type(ArithTraits::infinity(), - ArithTraits::infinity()); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { - return ArithTraits::isInf(x.real()) || - ArithTraits::isInf(x.imag()); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { - return ArithTraits::isNan(x.real()) || - ArithTraits::isNan(x.imag()); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return std::sqrt(::Kokkos::real(x) * ::Kokkos::real(x) + - ::Kokkos::imag(x) * ::Kokkos::imag(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return val_type(ArithTraits::zero(), - ArithTraits::zero()); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return val_type(ArithTraits::one(), - ArithTraits::zero()); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return val_type(ArithTraits::min(), - ArithTraits::min()); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return val_type(ArithTraits::max(), - ArithTraits::max()); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x.real(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type x) { - return x.imag(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return ::Kokkos::conj(x); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const - // val_type y) { - // const mag_type abs_x_square = x.real()*x.real() + x.imag()*x.imag(); - // const mag_type arg_x = ArithTraits::atan(x.imag()/x.real()); - // const mag_type half = mag_type(0.5); - // const mag_type alpha = (ArithTraits::pow(abs_x_square, - // half*y.real()) * - // ArithTraits::exp(-y.imag()*arg_x)); - // return val_type(alpha* ArithTraits::cos(y.real()*arg_x + - // half*y.imag()*ArithTraits::log(abs_x_square)), - // alpha* ArithTraits::sin(y.real()*arg_x + - // half*y.imag()*ArithTraits::log(abs_x_square))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const - // mag_type y) { - // const mag_type arg_x = ArithTraits::atan(x.imag()/x.real()); - // const mag_type alpha = ArithTraits::pow(abs(x),y); - // return val_type(alpha* ArithTraits::cos(y*arg_x), - // alpha* ArithTraits::sin(y*arg_x)); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return ::Kokkos::sqrt(x); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - // const mag_type r = ::Kokkos::abs(x); - // const mag_type phi = ::atan(x.imag()/x.real())/mag_type(3); - // const mag_type re = r* ::cos(phi); - // const mag_type im = r* ::sin(phi); - // return val_type(re,im); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { - // const mag_type xx = ::exp(x.real()); - // const mag_type re = xx* ::cos(x.imag()); - // const mag_type im = xx* ::sin(x.imag()); - // return val_type(re,im); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { - // return val_type(ArithTraits::log(abs(x)), - // ArithTraits::atan(x.imag()/x.real())); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type log10 (const val_type x) { - // return log(x)/ArithTraits::log(mag_type(10)); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type xx = exp(-ii*x) - exp(ii*x); - // const mag_type half = 0.5; - // return val_type(-half*xx.imag(),half*xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // const val_type xx = exp(x) - exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(),half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // const val_type ii(0, 1); - // const val_type e_nix = exp(-ii*x); - // const val_type e_pix = exp( ii*x); - // return ii*(e_nix - e_pix)/(e_nix + e_pix); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // const val_type xx = exp(x) - exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(), half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // const val_type xx = exp(x) + exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(), half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // const val_type e_2x = exp(2*x); - // return (e_2x - 1)/(e_2x + 1); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // const val_type ii(0, 1); - // const val_type xx = -ii*log(ii*x + sqrt(val_type(1) - x*x)); - // return val_type(xx.imag(),-xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type xx = -ii*log(x + ii*sqrt(val_type(1) - x*x)); - // return val_type(xx.imag(),-xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // val_type r_val; - // const val_type ii = val_type(0, 1); - // if (x == ii) { - // r_val = val_type(ArithTraits::nan(), - // std::numeric_limits::infinity()); - // } if (x == -ii) { - // r_val = val_type(ArithTraits::nan(), - // -std::numeric_limits::infinity()); - // } else { - // const val_type ii_x = ii*x; - // const mag_type half = 0.5; - // const val_type xx = log(val_type(1) - ii_x) - log(val_type(1) + ii_x); - // r_val = val_type(-half*xx.imag(), half*xx.real()); - // } - // return r_val; - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // ??? - return val_type(ArithTraits::nan(), ArithTraits::nan()); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { - return ArithTraits::epsilon(); // ??? - } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef ::Kokkos::complex::halfPrecision> halfPrecision; - typedef ::Kokkos::complex::doublePrecision> - doublePrecision; - - static const bool isComplex = true; - static const bool isOrdinal = false; - static const bool isComparable = false; - static const bool hasMachineParameters = - ArithTraits::hasMachineParameters; - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static std::string name() { return "Kokkos::complex"; } - // static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) { - // return sqrt (x); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return ArithTraits::sfmin(); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { - return ArithTraits::base(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { - return ArithTraits::prec(); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { - return ArithTraits::t(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { - return ArithTraits::rnd(); - } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { - return ArithTraits::emin(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return ArithTraits::rmin(); - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { - return ArithTraits::emax(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return ArithTraits::rmax(); - } -}; - -template <> -class ArithTraits< ::Kokkos::complex > { - public: - typedef ::Kokkos::complex val_type; - typedef double mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = true; - - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return val_type(ArithTraits::infinity(), - ArithTraits::infinity()); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { - return ArithTraits::isInf(x.real()) || - ArithTraits::isInf(x.imag()); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { - return ArithTraits::isNan(x.real()) || - ArithTraits::isNan(x.imag()); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return ::Kokkos::abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return val_type(ArithTraits::zero(), - ArithTraits::zero()); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return val_type(ArithTraits::one(), - ArithTraits::zero()); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return val_type(ArithTraits::min(), - ArithTraits::min()); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return val_type(ArithTraits::max(), - ArithTraits::max()); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x.real(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type x) { - return x.imag(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return ::Kokkos::conj(x); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const - // val_type y) { - // const mag_type abs_x_square = x.real()*x.real() + x.imag()*x.imag(); - // const mag_type arg_x = ArithTraits::atan(x.imag()/x.real()); - // const mag_type half = mag_type(0.5); - // const mag_type alpha = (ArithTraits::pow(abs_x_square, - // half*y.real()) * - // ArithTraits::exp(-y.imag()*arg_x)); - // return val_type(alpha* ArithTraits::cos(y.real()*arg_x + - // half*y.imag()*ArithTraits::log(abs_x_square)), - // alpha* ArithTraits::sin(y.real()*arg_x + - // half*y.imag()*ArithTraits::log(abs_x_square))); - - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const - // mag_type y) { - // const mag_type arg_x = ArithTraits::atan(x.imag()/x.real()); - // const mag_type alpha = ArithTraits::pow(abs(x),y); - // return val_type(alpha* ArithTraits::cos(y*arg_x), - // alpha* ArithTraits::sin(y*arg_x)); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return ::Kokkos::sqrt(x); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - // const mag_type r = ::Kokkos::abs(x); - // const mag_type phi = ::atan(x.imag()/x.real())/mag_type(3); - // const mag_type re = r* ::cos(phi); - // const mag_type im = r* ::sin(phi); - // return val_type(re,im); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { - // const mag_type xx = ::exp(x.real()); - // const mag_type re = xx* ::cos(x.imag()); - // const mag_type im = xx* ::sin(x.imag()); - // return val_type(re,im); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { - // return val_type(ArithTraits::log(abs(x)), - // ArithTraits::atan(x.imag()/x.real())); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type log10 (const val_type x) { - // return log(x)/ArithTraits::log(mag_type(10)); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type xx = exp(-ii*x) - exp(ii*x); - // const mag_type half = 0.5; - // return val_type(-half*xx.imag(),half*xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // const val_type xx = exp(x) - exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(),half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type e_nix = exp(-ii*x); - // const val_type e_pix = exp( ii*x); - // return ii*(e_nix - e_pix)/(e_nix + e_pix); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // const val_type xx = exp(x) - exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(), half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // const val_type xx = exp(x) + exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(), half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // const val_type e_2x = exp(2*x); - // return (e_2x - 1)/(e_2x + 1); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // const val_type ii(0, 1); - // const val_type xx = -ii*log(ii*x + sqrt(val_type(1) - x*x)); - // return val_type(xx.imag(),-xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type xx = -ii*log(x + ii*sqrt(val_type(1) - x*x)); - // return val_type(xx.imag(),-xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // val_type r_val; - // const val_type ii = val_type(0, 1); - // if (x == ii) { - // r_val = val_type(ArithTraits::nan(), - // std::numeric_limits::infinity()); - // } if (x == -ii) { - // r_val = val_type(ArithTraits::nan(), - // -std::numeric_limits::infinity()); - // } else { - // const val_type ii_x = ii*x; - // const mag_type half = 0.5; - // const val_type xx = log(val_type(1) - ii_x) - log(val_type(1) + ii_x); - // r_val = val_type(-half*xx.imag(), half*xx.real()); - // } - // return r_val; - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // ??? - return val_type(ArithTraits::nan(), ArithTraits::nan()); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { - return ArithTraits::epsilon(); // ??? - } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef ::Kokkos::complex::halfPrecision> halfPrecision; - typedef ::Kokkos::complex::doublePrecision> - doublePrecision; - - static const bool isComplex = true; - static const bool isOrdinal = false; - static const bool isComparable = false; - static const bool hasMachineParameters = - ArithTraits::hasMachineParameters; - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static std::string name() { return "Kokkos::complex"; } - // static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) { - // return sqrt (x); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return ArithTraits::sfmin(); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { - return ArithTraits::base(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { - return ArithTraits::prec(); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { - return ArithTraits::t(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { - return ArithTraits::rnd(); - } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { - return ArithTraits::emin(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return ArithTraits::rmin(); - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { - return ArithTraits::emax(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return ArithTraits::rmax(); - } -}; - -template <> -class ArithTraits { - public: - typedef char val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - // The C(++) standard does not require that char be signed. In - // fact, signed char, unsigned char, and char are distinct types. - // We can use std::numeric_limits here because it's a const bool, - // not a class method. - static const bool is_signed = std::numeric_limits::is_signed; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - // This avoids warnings based on whether char is signed or unsigned - return integer_abs::abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return CHAR_MIN; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return CHAR_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - if (is_signed) { - return intPowSigned(x, y); - } else { - return intPowUnsigned(x, y); - } - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // C++11 defines std::sqrt for integer arguments. However, we - // currently can't assume C++11. - // - // This cast will result in no loss of accuracy, though it might - // be more expensive than it should, if we were clever about using - // bit operations. - // - // We take the absolute value first to avoid negative arguments. - // Negative real arguments to sqrt(float) return (float) NaN, but - // built-in integer types do not have an equivalent to NaN. - // Casting NaN to an integer type will thus result in some integer - // value which appears valid, but is not. We cannot raise an - // exception in device functions. Thus, we prefer to take the - // absolute value of x first, to avoid issues. Another - // possibility would be to test for a NaN output and convert it to - // some reasonable value (like 0), though this might be more - // expensive than the absolute value interpreted using the ternary - // operator. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "char"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef signed char val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x >= 0 ? x : -x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return SCHAR_MIN; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return SCHAR_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowSigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "signed char"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef unsigned char val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; // it's unsigned, so it's positive - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return UCHAR_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowUnsigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "unsigned char"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef short val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - // std::abs appears to work with CUDA 5.5 at least, but I'll use - // the ternary expression for maximum generality. Note that this - // expression does not necessarily obey the rules for fabs() with - // NaN input, so it should not be used for floating-point types. - // It's perfectly fine for signed integer types, though. - return x >= 0 ? x : -x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - // Macros like this work with CUDA, but - // std::numeric_limits::min() does not, because it is - // not marked as a __device__ function. - return SHRT_MIN; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return SHRT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowSigned(x, y); - } - //! Integer square root returns a lower bound. - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // short doesn't implement a NaN value, but we can still have it - // return some "flag" value that can help users find use of - // uninitialized data. - return static_cast(-1); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "short"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef unsigned short val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; // it's unsigned, so it's positive - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return USHRT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowUnsigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // unsigned short doesn't implement a NaN value, but we can still - // have it return some "flag" value that can help users find use - // of uninitialized data. - return max(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "unsigned short"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef int val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - // std::abs appears to work with CUDA 5.5 at least, but I'll use - // the ternary expression for maximum generality. Note that this - // expression does not necessarily obey the rules for fabs() with - // NaN input, so it should not be used for floating-point types. - // It's perfectly fine for signed integer types, though. - return x >= 0 ? x : -x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - // Macros like INT_MIN work with CUDA, but - // std::numeric_limits::min() does not, because it is - // not marked as a __device__ function. - return INT_MIN; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return INT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowSigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // int doesn't implement a NaN value, but we can still have it - // return some "flag" value that can help users find use of - // uninitialized data. - return -1; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "int"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef unsigned int val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; // it's unsigned, so it's positive - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return UINT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowUnsigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // unsigned int doesn't implement a NaN value, but we can still - // have it return some "flag" value that can help users find use - // of uninitialized data. - return max(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "unsigned int"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef long val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x >= 0 ? x : -x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return LONG_MIN; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return LONG_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowSigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - using std::abs; - using std::sqrt; -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - return static_cast(sqrt(static_cast(abs(x)))); -#else - return static_cast(sqrt(static_cast(abs(x)))); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // long doesn't implement a NaN value, but we can still have it - // return some "flag" value that can help users find use of - // uninitialized data. - return -1; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "long"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef unsigned long val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return ULONG_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowUnsigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - using std::sqrt; -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - return static_cast(sqrt(static_cast(x))); -#else - return static_cast(sqrt(static_cast(x))); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::cbrtl; - return static_cast(::cbrtl(static_cast(x))); -#else - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // unsigned long doesn't implement a NaN value, but we can still - // have it return some "flag" value that can help users find use - // of uninitialized data. - return max(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "unsigned long"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef long long val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x >= 0 ? x : -x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return LLONG_MIN; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return LLONG_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowSigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::abs; - using std::sqrt; - // IEEE 754 promises that long double has at least 64 significand - // bits, so we can use it to represent any signed or unsigned - // 64-bit integer type exactly. However, CUDA does not implement - // long double for device functions. - return static_cast(sqrt(static_cast(abs(x)))); -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - // Casting from a 64-bit integer type to double does result in a - // loss of accuracy. However, it gives us a good first - // approximation. For very large numbers, we may lose some - // significand bits, but will always get within a factor of two - // (assuming correct rounding) of the exact double-precision - // number. We could then binary search between half the result - // and twice the result (assuming the latter is <= INT64_MAX, - // which it has to be, so we don't have to check) to ensure - // correctness. It actually should suffice to check numbers - // within 1 of the result. - return static_cast(sycl::sqrt(static_cast(abs(x)))); -#else - return static_cast(::sqrt(static_cast(abs(x)))); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::abs; - using std::cbrtl; - return static_cast(cbrtl(static_cast(abs(x)))); -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - return static_cast(sycl::cbrt(static_cast(abs(x)))); -#else - return static_cast(::cbrt(static_cast(abs(x)))); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // long long doesn't implement a NaN value, but we can still have - // it return some "flag" value that can help users find use of - // uninitialized data. - return -1; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "long long"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef unsigned long long val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; // unsigned integers are always nonnegative - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return ULLONG_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowUnsigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::sqrt; - return static_cast(sqrt(static_cast(x))); -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - return static_cast(sycl::sqrt(static_cast(x))); -#else - return static_cast(::sqrt(static_cast(x))); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::cbrtl; - return static_cast(cbrtl(static_cast(x))); -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - return static_cast(sycl::cbrt(static_cast(x))); -#else - return static_cast(::cbrt(static_cast(x))); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // unsigned long long doesn't implement a NaN value, but we can - // still have it return some "flag" value that can help users find - // use of uninitialized data. - return max(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "unsigned long long"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -// dd_real and qd_real are floating-point types provided by the QD -// library of David Bailey (LBNL): -// -// http://crd-legacy.lbl.gov/~dhbailey/mpdist/ -// -// dd_real uses two doubles (128 bits), and qd_real uses four doubles -// (256 bits). -// -// Kokkos does not currently support these types in device -// functions. It should be possible to use Kokkos' support for -// aggregate types to implement device function support for dd_real -// and qd_real, but we have not done this yet (as of 09 Jan 2015). -// Hence, the class methods of the ArithTraits specializations for -// dd_real and qd_real are not marked as device functions. -#ifdef HAVE_KOKKOS_QD -template <> -struct ArithTraits { - typedef dd_real val_type; - typedef dd_real mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static inline bool isInf(const val_type& x) { return isinf(x); } - static inline bool isNan(const val_type& x) { return isnan(x); } - static inline mag_type abs(const val_type& x) { return ::abs(x); } - static inline val_type zero() { return val_type(0.0); } - static inline val_type one() { return val_type(1.0); } - static inline val_type min() { return std::numeric_limits::min(); } - static inline val_type max() { return std::numeric_limits::max(); } - static inline mag_type real(const val_type& x) { return x; } - static inline mag_type imag(const val_type&) { return zero(); } - static inline val_type conj(const val_type& x) { return x; } - static inline val_type pow(const val_type& x, const val_type& y) { - return ::pow(x, y); - } - static inline val_type sqrt(const val_type& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif - } - static inline val_type cbrt(const val_type& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::cbrt(x); -#else - return ::cbrt(x); -#endif - } - static inline val_type exp(const val_type& x) { return ::exp(x); } - static inline val_type log(const val_type& x) { - // dd_real puts its transcendental functions in the global namespace. - return ::log(x); - } - static inline val_type log10(const val_type& x) { return ::log10(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { - return ::sin(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { - return ::cos(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::tan(x); -#else - return std::tan(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { - return ::sinh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { - return ::cosh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { - return ::tanh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::asin(x); -#else - return ::asin(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::acos(x); -#else - return ::acos(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::atan(x); -#else - return ::atan(x); -#endif - } - static inline val_type nan() { return val_type::_nan; } - static val_type epsilon() { return std::numeric_limits::epsilon(); } - - typedef dd_real magnitudeType; - typedef double halfPrecision; - typedef qd_real doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - - static mag_type eps() { return epsilon(); } - static mag_type sfmin() { return min(); } - static int base() { return std::numeric_limits::radix; } - static mag_type prec() { return eps() * base(); } - static int t() { return std::numeric_limits::digits; } - static mag_type rnd() { - return std::numeric_limits::round_style == std::round_to_nearest - ? one() - : zero(); - } - static int emin() { return std::numeric_limits::min_exponent; } - static mag_type rmin() { return std::numeric_limits::min(); } - static int emax() { return std::numeric_limits::max_exponent; } - static mag_type rmax() { return std::numeric_limits::max(); } - static mag_type magnitude(const val_type& x) { return ::abs(x); } - static val_type conjugate(const val_type& x) { return conj(x); } - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static std::string name() { return "dd_real"; } - static val_type squareroot(const val_type& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif - } -}; - -template <> -struct ArithTraits { - typedef qd_real val_type; - typedef qd_real mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static inline bool isInf(const val_type& x) { return isinf(x); } - static inline bool isNan(const val_type& x) { return isnan(x); } - static inline mag_type abs(const val_type& x) { return ::abs(x); } - static inline val_type zero() { return val_type(0.0); } - static inline val_type one() { return val_type(1.0); } - static inline val_type min() { return std::numeric_limits::min(); } - static inline val_type max() { return std::numeric_limits::max(); } - static inline mag_type real(const val_type& x) { return x; } - static inline mag_type imag(const val_type&) { return zero(); } - static inline val_type conj(const val_type& x) { return x; } - static inline val_type pow(const val_type& x, const val_type& y) { - return ::pow(x, y); - } - static inline val_type sqrt(const val_type& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif - } - static inline val_type cbrt(const val_type& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::cbrt(x); -#else - return ::cbrt(x); -#endif - } - static inline val_type exp(const val_type& x) { return ::exp(x); } - static inline val_type log(const val_type& x) { - // val_type puts its transcendental functions in the global namespace. - return ::log(x); - } - static inline val_type log10(const val_type& x) { return ::log10(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { - return ::sin(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { - return ::cos(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::tan(x); -#else - return std::tan(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { - return ::sinh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { - return ::cosh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { - return ::tanh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::asin(x); -#else - return ::asin(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::acos(x); -#else - return ::acos(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::atan(x); -#else - return ::atan(x); -#endif - } - static inline val_type nan() { return val_type::_nan; } - static inline val_type epsilon() { - return std::numeric_limits::epsilon(); - } - - typedef qd_real magnitudeType; - typedef dd_real halfPrecision; - // The QD library does not have an "oct-double real" class. One - // could use an arbitrary-precision library like MPFR or ARPREC, - // with the precision set appropriately, to get an - // extended-precision type for qd_real. - typedef qd_real doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - - static mag_type eps() { return epsilon(); } - static mag_type sfmin() { return min(); } - static int base() { return std::numeric_limits::radix; } - static mag_type prec() { return eps() * base(); } - static int t() { return std::numeric_limits::digits; } - static mag_type rnd() { - return std::numeric_limits::round_style == std::round_to_nearest - ? one() - : zero(); - } - static int emin() { return std::numeric_limits::min_exponent; } - static mag_type rmin() { return std::numeric_limits::min(); } - static int emax() { return std::numeric_limits::max_exponent; } - static mag_type rmax() { return std::numeric_limits::max(); } - static mag_type magnitude(const val_type& x) { return ::abs(x); } - static val_type conjugate(const val_type& x) { return conj(x); } - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static std::string name() { return "qd_real"; } - static val_type squareroot(const val_type& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif - } -}; -#endif // HAVE_KOKKOS_QD - -} // namespace Details - -// Promote ArithTraits into Kokkos namespace. At some point, we -// will remove it from the Details namespace completely. We leave -// it there for now, because a lot of code depends on it being -// there. -using Details::ArithTraits; -} // namespace Kokkos - -#endif // KOKKOS_ARITHTRAITS_HPP