From b894b6cc6780491b28a35a9909c7bb5c79bdb847 Mon Sep 17 00:00:00 2001 From: iyamaza Date: Tue, 31 Mar 2020 15:53:25 -0600 Subject: [PATCH 001/126] remove commented-out code.. --- .../impl/KokkosSparse_twostage_gauss_seidel_impl.hpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index 0f855485f3..619c2811f2 100644 --- a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -432,18 +432,7 @@ namespace KokkosSparse{ values (nnzL) = values_view (k); nnzL ++; } else if (column_view (k) == i) { - #if 0 // Kokkos' sptrsv assumes diagonal of L and U to come at end and start - if (two_stage) { - diags (i) = one / values_view (k); - } else { - values (nnzL) = values_view (k); - values2 (nnzU) = values_view (k); - nnzL ++; - nnzU ++; - } - #else diags (i) = values_view (k); - #endif } else if (column_view (k) < num_rows) { values2 (nnzU) = values_view (k); nnzU ++; From 38fca067f687ed5c6ac03ef3f3e81954b062b458 Mon Sep 17 00:00:00 2001 From: Paul Kuberry Date: Thu, 3 Dec 2020 20:18:42 -0700 Subject: [PATCH 002/126] Modified UTV, SolveUTV, and SetIdentity to support m x n matrices. Added some team_barrier()'s to QR and Householder algorithms to eliminate a race condition exposed when threads per team >> vector lanes. 
--- ...atched_Householder_TeamVector_Internal.hpp | 1 + ...osBatched_QR_FormQ_TeamVector_Internal.hpp | 6 +++--- ...WithColumnPivoting_TeamVector_Internal.hpp | 4 +++- .../KokkosBatched_SetIdentity_Impl.hpp | 4 ++-- .../KokkosBatched_SetIdentity_Internal.hpp | 12 +++++------ ...KokkosBatched_SolveUTV_TeamVector_Impl.hpp | 4 ++-- ...osBatched_SolveUTV_TeamVector_Internal.hpp | 21 ++++++++++++------- .../KokkosBatched_UTV_TeamVector_Impl.hpp | 2 +- .../KokkosBatched_UTV_TeamVector_Internal.hpp | 15 ++++++------- 9 files changed, 40 insertions(+), 29 deletions(-) diff --git a/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp b/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp index 7b3d8b293e..b63ca28fcf 100644 --- a/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_Householder_TeamVector_Internal.hpp @@ -68,6 +68,7 @@ namespace KokkosBatched { [&](const int &i) { x2[i*x2s] *= inv_chi1_minus_alpha; }); + member.team_barrier(); // later consider to use the following // SerialScaleInternal::invoke(m_x2, inv_chi1_minus_alpha, x2, x2s); diff --git a/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp b/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp index d1b59d652f..d443bad513 100644 --- a/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp @@ -23,7 +23,7 @@ namespace KokkosBatched { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, + const int m, const int n, const int k, /* */ ValueType * A, const int as0, const int as1, /* */ ValueType * t, const int ts, @@ -44,12 +44,12 @@ namespace KokkosBatched { if (is_Q_zero) TeamVectorSetInternal::invoke(member, m, value_type(1), Q, qs0+qs1); else - TeamVectorSetIdentityInternal::invoke(member, m, Q, qs0, qs1); + TeamVectorSetIdentityInternal::invoke(member, m, n, Q, qs0, qs1); member.team_barrier(); return 
TeamVectorApplyQ_LeftForwardInternal ::invoke(member, - m, m, k, + m, n, k, A, as0, as1, t, ts, Q, qs0, qs1, diff --git a/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp b/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp index 2b0c1e4569..08439b0b28 100644 --- a/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_QR_WithColumnPivoting_TeamVector_Internal.hpp @@ -96,6 +96,7 @@ namespace KokkosBatched { A, as0, as1, A, as0, as1, norm, 1); + member.team_barrier(); const bool finish_when_rank_found = (matrix_rank == -1); @@ -158,7 +159,7 @@ namespace KokkosBatched { if (m_atl == 0) max_diag = ats::abs(A[0]); const value_type val_diag = ats::abs(A_part3x3.A11[0]), - threshold(max_diag*ats::epsilon()); + threshold(10*max_diag*ats::epsilon()); if (val_diag < threshold) { matrix_rank = m_atl; if (finish_when_rank_found) @@ -171,6 +172,7 @@ namespace KokkosBatched { n_A22, A_part3x3.A12, as1, norm_part1x3.A2, 1); + member.team_barrier(); /// ----------------------------------------------------- A_part2x2.mergeToATL (A_part3x3); t_part2x1.mergeToAT (t_part3x1); diff --git a/src/batched/KokkosBatched_SetIdentity_Impl.hpp b/src/batched/KokkosBatched_SetIdentity_Impl.hpp index 4c0ea12348..0bf12243ee 100644 --- a/src/batched/KokkosBatched_SetIdentity_Impl.hpp +++ b/src/batched/KokkosBatched_SetIdentity_Impl.hpp @@ -19,7 +19,7 @@ namespace KokkosBatched { SerialSetIdentity:: invoke(const AViewType &A) { return SerialSetIdentityInternal:: - invoke(A.extent(0), + invoke(A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1()); } @@ -36,7 +36,7 @@ namespace KokkosBatched { const AViewType &A) { return TeamSetIdentityInternal:: invoke(member, - A.extent(0), + A.extent(0), A.extent(1), A.data(), A.stride_0(), A.stride_1()); } diff --git a/src/batched/KokkosBatched_SetIdentity_Internal.hpp b/src/batched/KokkosBatched_SetIdentity_Internal.hpp index 40d8bbbaaf..8f7f6cf3f9 100644 --- 
a/src/batched/KokkosBatched_SetIdentity_Internal.hpp +++ b/src/batched/KokkosBatched_SetIdentity_Internal.hpp @@ -15,10 +15,10 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static int - invoke(const int m, + invoke(const int m, const int n, /* */ ValueType *__restrict__ A, const int as0, const int as1) { const ValueType one(1), zero(0); - for (int j=0;j ::invoke(member, @@ -133,6 +133,7 @@ namespace KokkosBatched { B, bs0, bs1, zero, W, ws0, ws1); + member.team_barrier(); /// W = T^{-1} W TeamVectorTrsmInternalLeftLower @@ -142,26 +143,31 @@ namespace KokkosBatched { one, T, ts0, ts1, W, ws0, ws1); + member.team_barrier(); /// X = V^T W TeamVectorGemmInternal ::invoke(member, - m, nrhs, matrix_rank, + n, nrhs, matrix_rank, one, V, vs1, vs0, W, ws0, ws1, zero, X, xs0, xs1); + member.team_barrier(); } else { + /// W = U^T B TeamVectorGemmInternal ::invoke(member, - m, nrhs, matrix_rank, + matrix_rank, nrhs, m, one, U, us1, us0, B, bs0, bs1, zero, X, xs0, xs1); + member.team_barrier(); + /// X = T^{-1} X TeamVectorTrsmInternalLeftUpper ::invoke(member, false, @@ -169,12 +175,13 @@ namespace KokkosBatched { one, T, ts0, ts1, X, xs0, xs1); + member.team_barrier(); } /// X = P^T X TeamVectorApplyPivotMatrixBackwardInternal ::invoke(member, - nrhs, m, + nrhs, matrix_rank, p, ps0, X, xs0, xs1); diff --git a/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp b/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp index 69b958d22d..b06c76b02a 100644 --- a/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp +++ b/src/batched/KokkosBatched_UTV_TeamVector_Impl.hpp @@ -31,7 +31,7 @@ namespace KokkosBatched { int &matrix_rank) { return TeamVectorUTV_Internal:: invoke(member, - A.extent(0), //A.extent(1), + A.extent(0), A.extent(1), A.data(), A.stride(0), A.stride(1), p.data(), p.stride(0), U.data(), U.stride(0), U.stride(1), diff --git a/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp b/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp index 
6f9a86e115..354dfa7c44 100644 --- a/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_UTV_TeamVector_Internal.hpp @@ -23,7 +23,7 @@ namespace KokkosBatched { KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const int m, // m = NumRows(A) + const int m, const int n, // m = NumRows(A), n = NumCols(A) /* */ ValueType * A, const int as0, const int as1, /* */ IntType * p, const int ps0, /* */ ValueType * U, const int us0, const int us1, @@ -41,23 +41,24 @@ namespace KokkosBatched { matrix_rank = -1; TeamVectorQR_WithColumnPivotingInternal ::invoke(member, - m, m, + m, n, A, as0, as1, t, ts0, p, ps0, work, matrix_rank); - + TeamVectorQR_FormQ_Internal ::invoke(member, - m, matrix_rank, + m, matrix_rank, matrix_rank, A, as0, as1, t, ts0, U, us0, us1, work); + member.team_barrier(); /// for rank deficient matrix - if (matrix_rank < m) { + if (matrix_rank < n) { const value_type zero(0); TeamVectorSetLowerTriangularInternal ::invoke(member, @@ -67,14 +68,14 @@ namespace KokkosBatched { TeamVectorQR_Internal ::invoke(member, - m, matrix_rank, + n, matrix_rank, A, as1, as0, t, ts0, work); TeamVectorQR_FormQ_Internal ::invoke(member, - m, matrix_rank, + n, matrix_rank, matrix_rank, A, as1, as0, t, ts0, V, vs1, vs0, From 4517ee65454c5a8377870d249d26bd3b8a26db6a Mon Sep 17 00:00:00 2001 From: Paul Kuberry Date: Fri, 4 Dec 2020 22:00:09 -0700 Subject: [PATCH 003/126] Replaced matrix_rank with n for reordering based on pivot in QR. 
--- src/batched/KokkosBatched_SolveUTV_TeamVector_Internal.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/batched/KokkosBatched_SolveUTV_TeamVector_Internal.hpp b/src/batched/KokkosBatched_SolveUTV_TeamVector_Internal.hpp index 7a844e2e29..f3fd1e80cd 100644 --- a/src/batched/KokkosBatched_SolveUTV_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_SolveUTV_TeamVector_Internal.hpp @@ -181,7 +181,7 @@ namespace KokkosBatched { /// X = P^T X TeamVectorApplyPivotMatrixBackwardInternal ::invoke(member, - nrhs, matrix_rank, + nrhs, n, p, ps0, X, xs0, xs1); From a60fdc5ea245a36650ba2d9dd37bd56cdb7126a9 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Sat, 5 Dec 2020 10:32:54 -0700 Subject: [PATCH 004/126] Restore distance-1 default algos from 3.2.0 In PR #828, I changed the default D1 coloring algorithm for CUDA from EB to VBBIT. Although VBBIT is faster for fairly balanced/regular problems, it is causing an increase in MTGS + GMRES iterations in some Ifpack2 tests compared to EB. This causes random failures since the tests expect convergence within a certain number of iters. 
--- src/graph/KokkosGraph_Distance1ColorHandle.hpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/src/graph/KokkosGraph_Distance1ColorHandle.hpp index 503c6c9310..077104ef9f 100644 --- a/src/graph/KokkosGraph_Distance1ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance1ColorHandle.hpp @@ -239,15 +239,22 @@ class GraphColoringHandle if(exec == KokkosKernels::Impl::Exec_SERIAL) { this->coloring_algorithm_type = COLORING_SERIAL; -#ifdef VERBOSE +#ifdef VERBOSE std:cout << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n"; +#endif + } + else if(KokkosKernels::Impl::kk_is_gpu_exec_space<ExecutionSpace>()) + { + this->coloring_algorithm_type = COLORING_EB; +#ifdef VERBOSE + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_EB\n"; #endif } else { - this->coloring_algorithm_type = COLORING_VBBIT; -#ifdef VERBOSE - std:cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VBBIT\n"; + this->coloring_algorithm_type = COLORING_VB; +#ifdef VERBOSE + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_VB\n"; #endif } } From 01cdaff334078c521248bad55dc4b3b1cd148469 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 8 Dec 2020 19:08:20 -0700 Subject: [PATCH 005/126] spadd: remove static constexpr member var Was causing linker errors on IBM xl --- src/sparse/KokkosSparse_spadd.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp index 9ed66ce2ad..1efae2c1a7 100644 --- a/src/sparse/KokkosSparse_spadd.hpp +++ b/src/sparse/KokkosSparse_spadd.hpp @@ -47,7 +47,7 @@ #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_Sorting.hpp" -#include <limits> +#include "Kokkos_ArithTraits.hpp" namespace KokkosSparse { namespace Experimental { @@ -86,10 +86,10 @@ struct SortedCountEntries { Bcolinds(Bcolinds_), 
Crowcounts(Crowcounts_) {} - static constexpr ordinal_type ORDINAL_MAX = std::numeric_limits<ordinal_type>::max(); - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits<ordinal_type>::max(); + // count the union of nonzeros in Arow and Brow size_type numEntries = 0; size_type ai = 0; @@ -417,7 +417,6 @@ template struct SortedNumericSumFunctor { using CscalarT = typename CvaluesT::non_const_value_type; - static constexpr ordinal_type ORDINAL_MAX = std::numeric_limits<ordinal_type>::max(); SortedNumericSumFunctor(const ArowptrsT& Arowptrs_, const BrowptrsT& Browptrs_, @@ -441,6 +440,8 @@ struct SortedNumericSumFunctor { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits<ordinal_type>::max(); + // count the union of nonzeros in Arow and Brow size_type ai = 0; size_type bi = 0; From 7328c067c42364f4e25d01ed24cf95ef3d861ccc Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 8 Dec 2020 18:49:50 -0700 Subject: [PATCH 006/126] BitUtils: update xl builtin popcount calls for xlclang++ --- src/common/KokkosKernels_BitUtils.hpp | 61 ++++++++++++++++++++------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/src/common/KokkosKernels_BitUtils.hpp b/src/common/KokkosKernels_BitUtils.hpp index 3dc78f77b1..c845e37c53 100644 --- a/src/common/KokkosKernels_BitUtils.hpp +++ b/src/common/KokkosKernels_BitUtils.hpp @@ -112,66 +112,95 @@ int pop_count( long long i ){ return _popcnt64(i); } -#elif defined( KOKKOS_COMPILER_IBM ) +#elif defined( __GNUC__ ) || defined( __GNUG__ ) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ - return __popcnt4(i); + return __builtin_popcount(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long i ){ - return __popcnt8(i); + return __builtin_popcountl(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long long i ){ - return __popcnt8(i); + return __builtin_popcountll(i); } +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( int 
i ){ + return __builtin_popcount(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( long i ){ + return __builtin_popcountl(i); +} + +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( long long i ){ + return __builtin_popcountll(i); +} +#elif defined(__ibmxl_vrm__) +// See https://www.ibm.com/support/knowledgecenter/SSGH3R_16.1.0/com.ibm.xlcpp161.aix.doc/compiler_ref/compiler_builtins.html +// link gives info about builtin names for xlclang++ +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned i ){ + return __builtin_popcnt4(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned long i ){ + return __builtin_popcnt8(i); +} +KOKKOS_FORCEINLINE_FUNCTION +int pop_count( unsigned long long i ){ + return __builtin_popcnt8(i); +} KOKKOS_FORCEINLINE_FUNCTION int pop_count( int i ){ - return __popcnt4(i); + return __builtin_popcnt4(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( long i ){ - return __popcnt8(i); + return __builtin_popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( long long i ){ - return __popcnt8(i); + return __builtin_popcnt8(i); } -#elif defined( __GNUC__ ) || defined( __GNUG__ ) +#elif defined(__IBMCPP__) || defined(__IBMC__) KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned i ){ - return __builtin_popcount(i); + return __popcnt4(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long i ){ - return __builtin_popcountl(i); + return __popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( unsigned long long i ){ - return __builtin_popcountll(i); + return __popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION int pop_count( int i ){ - return __builtin_popcount(i); + return __popcnt4(i); } + KOKKOS_FORCEINLINE_FUNCTION -int pop_count( long i ){ - return __builtin_popcountl(i); +int pop_count( long i ){ + return __popcnt8(i); } KOKKOS_FORCEINLINE_FUNCTION -int pop_count( long long i ){ - return __builtin_popcountll(i); +int pop_count( long long i ){ + return __popcnt8(i); } #else From 4ff54c6159a72fe3e67cbf6db0400b4bb5de0a3e Mon Sep 17 00:00:00 2001 
From: Luc Berger-Vergiat Date: Tue, 15 Dec 2020 07:35:22 -0700 Subject: [PATCH 007/126] Adding a smoke test using some CMake logic The new test needs Kokkos Kernels to be installed first. I am not sure how to check that we have performed installation with CMake but if that can be done then the test could be guarded so that it does not fail when an install is not performed. The test also needs "cmake" to be a valid command (i.e. cmake is in $PATH), that seems like a reasonable assumption given that we use CMake to configure the package... --- CMakeLists.txt | 2 ++ smoke_test/CMakeLists.txt | 13 +++++++++++++ smoke_test/CMakeLists.txt.in | 7 +++++++ smoke_test/run_smoke_test.sh.in | 11 +++++++++++ 4 files changed, 33 insertions(+) create mode 100644 smoke_test/CMakeLists.txt create mode 100644 smoke_test/CMakeLists.txt.in create mode 100755 smoke_test/run_smoke_test.sh.in diff --git a/CMakeLists.txt b/CMakeLists.txt index 5196745b5c..abb00c7de8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,6 +79,7 @@ IF (KokkosKernels_INSTALL_TESTING) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(test_common) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(unit_test) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(smoke_test) KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) ELSE() # Regular build, not install testing @@ -199,6 +200,7 @@ ELSE() KOKKOSKERNELS_ADD_TEST_DIRECTORIES(test_common) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(unit_test) + KOKKOSKERNELS_ADD_TEST_DIRECTORIES(smoke_test) KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) KOKKOSKERNELS_PACKAGE_POSTPROCESS() diff --git a/smoke_test/CMakeLists.txt b/smoke_test/CMakeLists.txt new file mode 100644 index 0000000000..68518240b7 --- /dev/null +++ b/smoke_test/CMakeLists.txt @@ -0,0 +1,13 @@ +# First copy the CMakeList.txt so we can build the test +file(COPY ${PACKAGE_SOURCE_DIR}/smoke_test/CMakeLists.txt.in DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/source) +file(RENAME 
${CMAKE_CURRENT_BINARY_DIR}/source/CMakeLists.txt.in ${CMAKE_CURRENT_BINARY_DIR}/source/CMakeLists.txt) + +# Second copy the source files needed to the build area +file(COPY ${PACKAGE_SOURCE_DIR}/perf_test/sparse/KokkosSparse_pcg.hpp DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/source) +file(COPY ${PACKAGE_SOURCE_DIR}/perf_test/sparse/KokkosSparse_block_pcg.cpp DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/source) + +# Third write a configure file that can be invoked to test the library installation +configure_file(${PACKAGE_SOURCE_DIR}/smoke_test/run_smoke_test.sh.in ${CMAKE_CURRENT_BINARY_DIR}/run_smoke_test.sh @ONLY) + +add_test(NAME SMOKE_TEST COMMAND /bin/bash ${CMAKE_CURRENT_BINARY_DIR}/run_smoke_test.sh) +# KOKKOSKERNELS_ADD_TEST(NAME "SMOKE_TEST" COMMAND "${CMAKE_CURRENT_BINARY_DIR}/run_smoke_test.sh" COMPONENTS Sparse) diff --git a/smoke_test/CMakeLists.txt.in b/smoke_test/CMakeLists.txt.in new file mode 100644 index 0000000000..b877decb3a --- /dev/null +++ b/smoke_test/CMakeLists.txt.in @@ -0,0 +1,7 @@ +cmake_minimum_required(VERSION 3.13) +project(kokkoskernels_smoke_test CXX) + +find_package(KokkosKernels REQUIRED) + +add_executable(kokkoskernels_smoke_test KokkosSparse_block_pcg.cpp) +target_link_libraries(kokkoskernels_smoke_test PRIVATE Kokkos::kokkoskernels) diff --git a/smoke_test/run_smoke_test.sh.in b/smoke_test/run_smoke_test.sh.in new file mode 100755 index 0000000000..bcc9fbc178 --- /dev/null +++ b/smoke_test/run_smoke_test.sh.in @@ -0,0 +1,11 @@ +#!/bin/bash + +KOKKOSKERNELS_INTALL="@CMAKE_BINARY_DIR@" +SMOKE_TEST_SOURCE="@CMAKE_CURRENT_BINARY_DIR@/source" +SMOKE_TEST_BUILD="@CMAKE_CURRENT_BINARY_DIR@/build" + +cd "${SMOKE_TEST_BUILD}" +rm -rf CMake* + +cmake "${SMOKE_TEST_SOURCE}" \ + -D KokkosKernels_ROOT:PATH="@CMAKE_BINARY_DIR@/lib/cmake/KokkosKernels" From 211e1dde453b2e6c950f11b99c8c1cd073e60249 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 15 Dec 2020 08:56:45 -0700 Subject: [PATCH 008/126] Make tpl avail type combinations match tpl decl 
*_tpl_spec_decl.hpp files instantiate all type combinations allowed by the TPL, even if that is not a KokkosKernels ETI type combination. *_tpl_spec_avail.hpp was restricting to ETI type combinations, meaning that TPLs weren't getting called even if they had been instantiated in the library. Fix the spec_avail files to match the spec_decl files. --- .../tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp | 16 -- .../tpls/KokkosBlas1_dot_tpl_spec_avail.hpp | 16 -- .../tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp | 26 --- .../tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp | 16 -- .../tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp | 17 -- .../KokkosBlas1_nrminf_tpl_spec_avail.hpp | 16 -- .../tpls/KokkosBlas1_scal_tpl_spec_avail.hpp | 26 --- .../tpls/KokkosBlas1_scal_tpl_spec_decl.hpp | 2 - .../tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp | 48 ----- .../tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp | 48 ----- .../tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp | 48 ----- .../tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp | 48 ----- .../tpls/KokkosBlas_gesv_tpl_spec_avail.hpp | 24 --- .../tpls/KokkosBlas_trtri_tpl_spec_avail.hpp | 24 --- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 190 ------------------ 15 files changed, 565 deletions(-) diff --git a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp index 217e6f4939..c0e58b19b3 100644 --- a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp @@ -72,18 +72,10 @@ struct axpby_tpl_spec_avail< \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, 
Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -101,18 +93,10 @@ struct axpby_tpl_spec_avail< \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif } diff --git a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index 135dcc6d1b..d3b0fabd71 100644 --- a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -72,18 +72,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1,1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -101,18 +93,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1,1> { enum : bool { value = true }; }; -#if defined 
(KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif diff --git a/src/impl/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp index 8df32f62d8..182aba3115 100644 --- a/src/impl/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp @@ -70,18 +70,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS( unsigned long, double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS( unsigned long, float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS( unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS( unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -104,41 +96,23 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, double, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) 
KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, Kokkos::complex,Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOS_ENABLE_CUDA_UVM) -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, Kokkos::complex,Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#endif #endif diff --git 
a/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index 3ff2cf4703..5a44212e67 100644 --- a/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -70,18 +70,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -97,18 +89,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif diff --git a/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index 3dd558ccd1..3facb0c245 100644 --- a/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -57,7 +57,6 @@ struct nrm2_tpl_spec_avail { namespace KokkosBlas { namespace Impl { - // 
Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // double @@ -70,18 +69,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -97,18 +88,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif diff --git a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp index 1ebf2e2f40..16d22e7b02 100644 --- a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp @@ -70,18 +70,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if 
defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -97,18 +89,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif diff --git a/src/impl/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp index 2b92355dd9..114923cca7 100644 --- a/src/impl/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp @@ -71,18 +71,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined 
(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -99,33 +91,15 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOS_ENABLE_CUDA_UVM) -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#endif #endif diff --git a/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 579d9b81a5..d866702f4f 100644 --- a/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -376,7 +376,6 @@ KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, f KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS( 
Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -#if defined (KOKKOS_ENABLE_CUDA_UVM) KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) @@ -388,7 +387,6 @@ KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -#endif } } diff --git a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index c1fd67f9ea..5c6d1734dc 100644 --- a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -67,39 +67,15 @@ struct gemv_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( double, 
Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif #endif @@ -117,39 +93,15 @@ struct gemv_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( 
double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif #endif } diff --git a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp index 49f6fe743c..3b21c0e8a7 100644 --- a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp @@ -67,39 +67,15 @@ struct gemm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) 
KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif #endif @@ -117,47 +93,23 @@ struct gemm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined 
(KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, 
Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif #endif } diff --git a/src/impl/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp index bce7cb5f5d..03e2badcc1 100644 --- a/src/impl/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp @@ -66,39 +66,15 @@ struct trmm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( 
Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif #endif // KOKKOSKERNELS_ENABLE_TPL_BLAS @@ -114,47 +90,23 @@ struct trmm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) 
KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif #endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS } // namespace Impl diff --git a/src/impl/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp index 93808e3eb0..29a04fb715 100644 --- a/src/impl/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp @@ -66,39 +66,15 @@ struct trsm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined 
(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif #endif @@ -114,47 +90,23 @@ struct trsm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( 
Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif #endif } diff --git a/src/impl/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp index 917a55fec4..e25a9aa3f1 100644 --- a/src/impl/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp @@ -66,22 +66,10 @@ struct gesv_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) @@ -114,22 +102,10 @@ struct gesv_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) 
-#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ diff --git a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp index fa651f531f..4b602bd765 100644 --- a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp @@ -78,55 +78,31 @@ KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) #define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( SCALAR , LAYOUTA, MEMSPACE ) #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, 
Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif } // namespace Impl } // namespace KokkosBlas diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index abeab8c214..345b0b013c 100644 --- 
a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -71,117 +71,22 @@ struct spmv_tpl_spec_avail, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && 
defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif //CUDA_VERSION by itself cannot determine whether the generic cuSPARSE API is available: //cuSPARSE version 10.1.105 does not have the generic API, but it comes with the same CUDA_VERSION (10010) as 10.1.243 which does. @@ -190,117 +95,22 @@ struct spmv_tpl_spec_avail, int64_t, size_t, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, 
Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif #endif // CUSPARSE >= 10.3 (nested, implies >= 9.0) #endif // CUDA/CUSPARSE >= 9.0? 
From ec2b7a8091f882df6b834e2fd26a3390e34ccc37 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 17 Dec 2020 15:54:42 -0500 Subject: [PATCH 009/126] Add Dockerfile to build an image for HIP --- scripts/docker/Dockerfile.hip | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 scripts/docker/Dockerfile.hip diff --git a/scripts/docker/Dockerfile.hip b/scripts/docker/Dockerfile.hip new file mode 100644 index 0000000000..2db14b1009 --- /dev/null +++ b/scripts/docker/Dockerfile.hip @@ -0,0 +1,28 @@ +ARG BASE=rocm/dev-ubuntu-20.04:3.10 +FROM $BASE + +RUN apt-get update && apt-get install -y \ + git \ + wget \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV PATH=/opt/rocm/bin:$PATH + +ARG CMAKE_VERSION=3.18.5 +ENV CMAKE_DIR=/opt/cmake +RUN CMAKE_KEY=2D2CEF1034921684 && \ + CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ + CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ + CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ + gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \ + gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ + grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \ + mkdir -p ${CMAKE_DIR} && \ + sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ + rm cmake* +ENV PATH=${CMAKE_DIR}/bin:$PATH From e3bff613aaac87808cff50f1d3655873a5fa1707 Mon Sep 17 00:00:00 2001 From: Damien L-G Date: Thu, 17 Dec 2020 15:57:10 -0500 Subject: [PATCH 010/126] Add Jenkinsfile for nightly HIP build --- .jenkins/nightly.groovy | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 .jenkins/nightly.groovy diff --git a/.jenkins/nightly.groovy b/.jenkins/nightly.groovy new file mode 100644 index 0000000000..41e4daf71e --- /dev/null +++ 
b/.jenkins/nightly.groovy @@ -0,0 +1,41 @@ +pipeline { + agent none + + stages { + stage('HIP-ROCm-3.10-C++14') { + agent { + dockerfile { + filename 'Dockerfile.hip' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:3.10' + label 'rocm-docker && vega' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + steps { + sh '''rm -rf kokkos && + git clone -b develop https://github.com/kokkos/kokkos.git && cd kokkos && \ + mkdir build && cd build && \ + cmake \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DKokkos_ENABLE_HIP=ON \ + -DKokkos_ARCH_VEGA906=ON \ + .. && \ + make -j8 && make install && \ + cd ../.. && rm -rf kokkos''' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES=ON \ + -DKokkos_ENABLE_HIP=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + .. 
&& \ + make -j8 && ctest --verbose''' + } + } + } +} From ae6c978cf028bb07cb958b1950cd47f927d6d865 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Fri, 18 Dec 2020 15:43:54 -0700 Subject: [PATCH 011/126] cm_test_all_sandia: update cmake on systems with 3.17 cm_generate_makefile: replace Kokkos_CXX_STANDARD with CMAKE_CXX_STANDARD --- cm_generate_makefile.bash | 2 +- scripts/cm_test_all_sandia | 80 +++++++++++++++++++------------------- 2 files changed, 41 insertions(+), 41 deletions(-) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index bb246df3c6..82e854e07d 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -569,7 +569,7 @@ done if [ "$KOKKOS_CXX_STANDARD" == "" ]; then STANDARD_CMD= else - STANDARD_CMD=-DKokkos_CXX_STANDARD=${KOKKOS_CXX_STANDARD} + STANDARD_CMD=-DCMAKE_CXX_STANDARD=${KOKKOS_CXX_STANDARD} fi if [ "$COMPILER" == "" ]; then diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index c5f7148125..f2cdc4dbb6 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -445,12 +445,12 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then MODULE_ENVIRONMENT="source /projects/sems/modulefiles/utils/sems-modules-init.sh" eval "$MODULE_ENVIRONMENT" - module load sems-cmake/3.12.2 - BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/" - CUDA9_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/6.1.0" - CUDA10_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/7.3.0" - CUDA11_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/9.2.0" - CLANG7_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-cuda/9.2" + module load sems-cmake/3.17.1 + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/" + CUDA9_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/6.1.0" + CUDA10_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/7.3.0" + CUDA11_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/9.2.0" + CLANG7_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-cuda/9.2" 
SKIP_HWLOC=True if [ -z "$ARCH_FLAG" ]; then @@ -502,16 +502,16 @@ elif [ "$MACHINE" = "white" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 - BASE_MODULE_LIST="cmake/3.12.3,/" - IBM_MODULE_LIST="cmake/3.12.3,/xl/,gcc/7.2.0" - CUDA_MODULE_LIST="cmake/3.12.3,/,gcc/7.2.0,ibm/xl/16.1.1" - CUDA10_MODULE_LIST="cmake/3.12.3,/,gcc/7.4.0,ibm/xl/16.1.1" + BASE_MODULE_LIST="cmake/3.18.0,/" + IBM_MODULE_LIST="cmake/3.18.0,/xl/,gcc/7.2.0" + CUDA_MODULE_LIST="cmake/3.18.0,/,gcc/7.2.0,ibm/xl/16.1.1" + CUDA10_MODULE_LIST="cmake/3.18.0,/,gcc/7.4.0,ibm/xl/16.1.1" - GCC72_MODULE_TPL_LIST="cmake/3.12.3,/,netlib/3.8.0/gcc/7.2.0" - GCC74_MODULE_TPL_LIST="cmake/3.12.3,/,openblas/0.3.4/gcc/7.4.0" - CUDA_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" - CUDA10_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.4.0,openblas/0.3.4/gcc/7.4.0" - IBM_MODULE_TPL_LIST="cmake/3.12.3,/xl/,gcc/7.2.0,netlib/3.8.0/ibm/xl/16.1.1" + GCC72_MODULE_TPL_LIST="cmake/3.18.0,/,netlib/3.8.0/gcc/7.2.0" + GCC74_MODULE_TPL_LIST="cmake/3.18.0,/,openblas/0.3.4/gcc/7.4.0" + CUDA_MODULE_TPL_LIST="cmake/3.18.0,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" + CUDA10_MODULE_TPL_LIST="cmake/3.18.0,/,gcc/7.4.0,openblas/0.3.4/gcc/7.4.0" + IBM_MODULE_TPL_LIST="cmake/3.18.0,/xl/,gcc/7.2.0,netlib/3.8.0/ibm/xl/16.1.1" # Don't do pthread on white. 
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" @@ -555,14 +555,14 @@ elif [ "$MACHINE" = "weaver" ]; then eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True - BASE_MODULE_LIST="cmake/3.12.3,/" - IBM_MODULE_LIST="cmake/3.12.3,/xl/,gcc/7.2.0" - CUDA_MODULE_LIST="cmake/3.12.3,/,gcc/7.2.0,ibm/xl/16.1.1" - CUDA10_MODULE_LIST="cmake/3.12.3,/,gcc/7.4.0,ibm/xl/16.1.1" + BASE_MODULE_LIST="cmake/3.18.0,/" + IBM_MODULE_LIST="cmake/3.18.0,/xl/,gcc/7.2.0" + CUDA_MODULE_LIST="cmake/3.18.0,/,gcc/7.2.0,ibm/xl/16.1.1" + CUDA10_MODULE_LIST="cmake/3.18.0,/,gcc/7.4.0,ibm/xl/16.1.1" - GCC72_MODULE_TPL_LIST="cmake/3.12.3,/,openblas/0.2.20/gcc/7.2.0" - CUDA_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" - CUDA10_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.2.0,openblas/0.2.20/gcc/7.2.0" + GCC72_MODULE_TPL_LIST="cmake/3.18.0,/,openblas/0.2.20/gcc/7.2.0" + CUDA_MODULE_TPL_LIST="cmake/3.18.0,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" + CUDA10_MODULE_TPL_LIST="cmake/3.18.0,/,gcc/7.2.0,openblas/0.2.20/gcc/7.2.0" # Issues finding CUBLAS with cuda/10.1.243 module at configure # "Could NOT find TPLCUBLAS (missing: CUDA_CUBLAS_LIBRARIES)" # Once resolved add the compiler + modules below to the SPOT_CHECK_TPLS @@ -650,7 +650,7 @@ elif [ "$MACHINE" = "caraway" ]; then HIPCLANG_WARNING_FLAGS="" # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("rocm/3.8.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS") + COMPILERS=("rocm/3.10.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS") if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=VEGA900" @@ -707,21 +707,21 @@ elif [ "$MACHINE" = "apollo" ]; then module load sems-git module load sems-tex - module load sems-cmake/3.12.2 + module load sems-cmake/3.17.1 module load sems-gdb module load binutils SKIP_HWLOC=True - BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/" - CUDA9_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/5.3.0" - 
CUDA10_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/5.3.0" - CUDA101_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/7.3.0" + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/" + CUDA9_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/5.3.0" + CUDA10_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/5.3.0" + CUDA101_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/7.3.0" - CLANG_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,cuda/9.0.69" - NVCC_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/5.3.0" -# HPX_MODULE_LIST="sems-env,sems-cmake/3.12.2,hpx/1.2.1,sems-gcc/6.1.0,binutils" -# HPX3_MODULE_LIST="sems-env,sems-cmake/3.12.2,compilers/hpx/1.3.0,sems-gcc/6.1.0,binutils" + CLANG_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,cuda/9.0.69" + NVCC_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/5.3.0" +# HPX_MODULE_LIST="sems-env,sems-cmake/3.17.1,hpx/1.2.1,sems-gcc/6.1.0,binutils" +# HPX3_MODULE_LIST="sems-env,sems-cmake/3.17.1,compilers/hpx/1.3.0,sems-gcc/6.1.0,binutils" BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP" BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread" @@ -761,19 +761,19 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then module load sems-git module load sems-tex - module load sems-cmake/3.12.2 + module load sems-cmake/3.17.1 module load sems-gdb SKIP_HWLOC=True - BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/" - GCC91_MODULE_LIST="sems-env,sems-cmake/3.12.2,/" - NVCC_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/7.3.0" - NVCC_SEMSMODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/7.3.0" - NVCC11_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/9.2.0" + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/" + GCC91_MODULE_LIST="sems-env,sems-cmake/3.17.1,/" + NVCC_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/7.3.0" + NVCC_SEMSMODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/7.3.0" + NVCC11_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/9.2.0" - CLANG_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/6.1.0" - 
CLANG8_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,cuda/10.0" + CLANG_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/6.1.0" + CLANG8_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,cuda/10.0" BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Pthread" BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_OpenMP" From 97aa966ccdcc7bf2796eef8340353ccd520ecc71 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Wed, 16 Dec 2020 11:23:43 -0700 Subject: [PATCH 012/126] Adding Changelog for Release 3.3.00 Part of Kokkos C++ Performance Portability Programming EcoSystem 3.3 (cherry picked from commit fcebfb0472fc75cb6d6c93f5b8782c68cb5145c0) --- CHANGELOG.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a9a1ff7a2..51e31ef007 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,34 @@ # Change Log +## [3.3.00](https://github.com/kokkos/kokkos-kernels/tree/3.3.00) (2020-12-16) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.2.01...3.3.00) + +**Implemented enhancements:** +- Add permanent RCM reordering interface, and a basic serial implementation [\#854](https://github.com/kokkos/kokkos-kernels/pull/854) +- Half\_t explicit conversions [\#849](https://github.com/kokkos/kokkos-kernels/pull/849) +- Add batched gemm performance tests [\#838](https://github.com/kokkos/kokkos-kernels/pull/838) +- Add HIP support to src and perf\_test [\#828](https://github.com/kokkos/kokkos-kernels/pull/828) +- Factor out coarsening [\#827](https://github.com/kokkos/kokkos-kernels/pull/827) +- Allow enabling/disabling components at configuration time [\#823](https://github.com/kokkos/kokkos-kernels/pull/823) +- HIP: CMake work on tests and ETI [\#820](https://github.com/kokkos/kokkos-kernels/pull/820) +- HIP: KokkosBatched - hip specialization [\#812](https://github.com/kokkos/kokkos-kernels/pull/812) +- Distance-2 maximal independent set [\#801](https://github.com/kokkos/kokkos-kernels/pull/801) +- Use batched TRTRI & TRMM for Supernode-sptrsv setup
[\#797](https://github.com/kokkos/kokkos-kernels/pull/797) +- Initial support for half precision [\#794](https://github.com/kokkos/kokkos-kernels/pull/794) + +**Fixed bugs:** +- Fix issue with HIP and Kokkos\_ArithTraits [\#844](https://github.com/kokkos/kokkos-kernels/pull/844) +- HIP: fixing round of issues on AMD [\#840](https://github.com/kokkos/kokkos-kernels/pull/840) +- Throw an exception if BLAS GESV is not enabled [\#837](https://github.com/kokkos/kokkos-kernels/pull/837) +- Fixes -Werror for gcc with c++20 [\#836](https://github.com/kokkos/kokkos-kernels/pull/836) +- Add fallback condition to use spmv\_native when cuSPARSE does not work [\#834](https://github.com/kokkos/kokkos-kernels/pull/834) +- Fix install testing refactor for inline builds [\#811](https://github.com/kokkos/kokkos-kernels/pull/811) +- HIP: fix ArithTraits to support HIP backend [\#809](https://github.com/kokkos/kokkos-kernels/pull/809) +- cuSPARSE 11: fix spgemm and spmv\_struct\_tunning compilation error [\#804](https://github.com/kokkos/kokkos-kernels/pull/804) + +**Incompatibilities:** +- Remove pre-3.0 deprecated code [\#825](https://github.com/kokkos/kokkos-kernels/pull/825) + ## [3.2.01](https://github.com/kokkos/kokkos-kernels/tree/3.2.01) (2020-11-17) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.2.00...3.2.01) From 7c1a82de3aa027ceb760bd7e608f061632d07e72 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 21 Dec 2020 12:25:43 -0700 Subject: [PATCH 013/126] Update MINOR and PATCH version --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5196745b5c..209db7ce6e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,8 +24,8 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) PROJECT(KokkosKernels CXX) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) - SET(KokkosKernels_VERSION_MINOR 2) - SET(KokkosKernels_VERSION_PATCH 1) + SET(KokkosKernels_VERSION_MINOR 3) + SET(KokkosKernels_VERSION_PATCH 0) ENDIF() IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") From
08eecbaa0b85d2832ec971d52de19ef7d5cafd1d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 21 Dec 2020 14:24:53 -0700 Subject: [PATCH 014/126] cm_test_all_sandia: update mayer cmake version --- scripts/cm_test_all_sandia | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index f2cdc4dbb6..54f4bd4621 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -624,8 +624,7 @@ elif [ "$MACHINE" = "mayer" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=96 - BASE_MODULE_LIST="cmake/3.14.5,/" -# ARM_MODULE_LIST="cmake/3.12.2,/" + BASE_MODULE_LIST="cmake/3.17.1,/" ARMCLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" From 6d8011c81685f425e95c932398b26742ff3c601c Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Fri, 1 Jan 2021 21:44:44 -0700 Subject: [PATCH 015/126] smoke_test: upgrading in accordance to PR comments Adding CMake option to turn on or off install test. 
--- CMakeLists.txt | 6 ++++-- install_test/CMakeLists.txt | 15 +++++++++++++++ install_test/CMakeLists.txt.in | 13 +++++++++++++ install_test/run_install_test.sh.in | 13 +++++++++++++ smoke_test/CMakeLists.txt | 13 ------------- smoke_test/CMakeLists.txt.in | 7 ------- smoke_test/run_smoke_test.sh.in | 11 ----------- 7 files changed, 45 insertions(+), 33 deletions(-) create mode 100644 install_test/CMakeLists.txt create mode 100644 install_test/CMakeLists.txt.in create mode 100755 install_test/run_install_test.sh.in delete mode 100644 smoke_test/CMakeLists.txt delete mode 100644 smoke_test/CMakeLists.txt.in delete mode 100755 smoke_test/run_smoke_test.sh.in diff --git a/CMakeLists.txt b/CMakeLists.txt index abb00c7de8..0a9c8c331a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,7 +79,6 @@ IF (KokkosKernels_INSTALL_TESTING) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(test_common) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(unit_test) - KOKKOSKERNELS_ADD_TEST_DIRECTORIES(smoke_test) KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) ELSE() # Regular build, not install testing @@ -197,10 +196,13 @@ ELSE() MESSAGE("") # Skip building Kokkos Kernels if we are doing an installation test ADD_SUBDIRECTORY(src) + IF(KokkosKernels_ENABLE_INSTALL_TEST) + ADD_SUBDIRECTORY(install_test) + MESSAGE("The install test has been enabled, you will need to peform: make install before running the tests otherwise install_test will fail") + ENDIF() KOKKOSKERNELS_ADD_TEST_DIRECTORIES(test_common) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(unit_test) - KOKKOSKERNELS_ADD_TEST_DIRECTORIES(smoke_test) KOKKOSKERNELS_ADD_EXAMPLE_DIRECTORIES(example) KOKKOSKERNELS_PACKAGE_POSTPROCESS() diff --git a/install_test/CMakeLists.txt b/install_test/CMakeLists.txt new file mode 100644 index 0000000000..4be641e87a --- /dev/null +++ b/install_test/CMakeLists.txt @@ -0,0 +1,15 @@ +# First copy the CMakeList.txt so we can build the test 
+configure_file(${PACKAGE_SOURCE_DIR}/install_test/CMakeLists.txt.in ${CMAKE_CURRENT_BINARY_DIR}/source/CMakeLists.txt) + +# Second copy the source files needed to the build area +file(COPY ${PACKAGE_SOURCE_DIR}/perf_test/sparse/KokkosSparse_pcg.hpp DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/source) +file(COPY ${PACKAGE_SOURCE_DIR}/perf_test/sparse/KokkosSparse_block_pcg.cpp DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/source) + +# Third write a configure file that can be invoked to test the library installation +configure_file(${PACKAGE_SOURCE_DIR}/install_test/run_install_test.sh.in ${CMAKE_CURRENT_BINARY_DIR}/run_install_test.sh @ONLY) + +# Fourth create the build directory where the installation of the cg example will take place +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/build) + +add_test(NAME install_test COMMAND /bin/bash ${CMAKE_CURRENT_BINARY_DIR}/run_install_test.sh) +# KOKKOSKERNELS_ADD_TEST(NAME "install_test" COMMAND "${CMAKE_CURRENT_BINARY_DIR}/run_install_test.sh" COMPONENTS Sparse) diff --git a/install_test/CMakeLists.txt.in b/install_test/CMakeLists.txt.in new file mode 100644 index 0000000000..92f41059f5 --- /dev/null +++ b/install_test/CMakeLists.txt.in @@ -0,0 +1,13 @@ +cmake_minimum_required(VERSION 3.13) +project(kokkoskernels_install_test CXX) + +include(CTest) + +find_package(KokkosKernels REQUIRED) + +add_executable(kokkoskernels_install_test KokkosSparse_block_pcg.cpp) +target_link_libraries(kokkoskernels_install_test PRIVATE Kokkos::kokkoskernels) + +if(BUILD_TESTING) + add_test(NAME cg_test COMMAND kokkoskernels_install_test) +endif() diff --git a/install_test/run_install_test.sh.in b/install_test/run_install_test.sh.in new file mode 100755 index 0000000000..118a599bd9 --- /dev/null +++ b/install_test/run_install_test.sh.in @@ -0,0 +1,13 @@ +#!/bin/bash + +KOKKOSKERNELS_INTALL="@CMAKE_BINARY_DIR@" +INSTALL_TEST_SOURCE="@CMAKE_CURRENT_BINARY_DIR@/source" +INSTALL_TEST_BUILD="@CMAKE_CURRENT_BINARY_DIR@/build" + +cd "${INSTALL_TEST_BUILD}" 
+rm -rf CMake* + +cmake "${INSTALL_TEST_SOURCE}" \ + -D KokkosKernels_ROOT:PATH="@CMAKE_BINARY_DIR@/lib/cmake/KokkosKernels" | tee configure_install_test.log + +make -j 4 | tee make_install_test.log diff --git a/smoke_test/CMakeLists.txt b/smoke_test/CMakeLists.txt deleted file mode 100644 index 68518240b7..0000000000 --- a/smoke_test/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -# First copy the CMakeList.txt so we can build the test -file(COPY ${PACKAGE_SOURCE_DIR}/smoke_test/CMakeLists.txt.in DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/source) -file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/source/CMakeLists.txt.in ${CMAKE_CURRENT_BINARY_DIR}/source/CMakeLists.txt) - -# Second copy the source files needed to the build area -file(COPY ${PACKAGE_SOURCE_DIR}/perf_test/sparse/KokkosSparse_pcg.hpp DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/source) -file(COPY ${PACKAGE_SOURCE_DIR}/perf_test/sparse/KokkosSparse_block_pcg.cpp DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/source) - -# Third write a configure file that can be invoked to test the library installation -configure_file(${PACKAGE_SOURCE_DIR}/smoke_test/run_smoke_test.sh.in ${CMAKE_CURRENT_BINARY_DIR}/run_smoke_test.sh @ONLY) - -add_test(NAME SMOKE_TEST COMMAND /bin/bash ${CMAKE_CURRENT_BINARY_DIR}/run_smoke_test.sh) -# KOKKOSKERNELS_ADD_TEST(NAME "SMOKE_TEST" COMMAND "${CMAKE_CURRENT_BINARY_DIR}/run_smoke_test.sh" COMPONENTS Sparse) diff --git a/smoke_test/CMakeLists.txt.in b/smoke_test/CMakeLists.txt.in deleted file mode 100644 index b877decb3a..0000000000 --- a/smoke_test/CMakeLists.txt.in +++ /dev/null @@ -1,7 +0,0 @@ -cmake_minimum_required(VERSION 3.13) -project(kokkoskernels_smoke_test CXX) - -find_package(KokkosKernels REQUIRED) - -add_executable(kokkoskernels_smoke_test KokkosSparse_block_pcg.cpp) -target_link_libraries(kokkoskernels_smoke_test PRIVATE Kokkos::kokkoskernels) diff --git a/smoke_test/run_smoke_test.sh.in b/smoke_test/run_smoke_test.sh.in deleted file mode 100755 index bcc9fbc178..0000000000 --- 
a/smoke_test/run_smoke_test.sh.in +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -KOKKOSKERNELS_INTALL="@CMAKE_BINARY_DIR@" -SMOKE_TEST_SOURCE="@CMAKE_CURRENT_BINARY_DIR@/source" -SMOKE_TEST_BUILD="@CMAKE_CURRENT_BINARY_DIR@/build" - -cd "${SMOKE_TEST_BUILD}" -rm -rf CMake* - -cmake "${SMOKE_TEST_SOURCE}" \ - -D KokkosKernels_ROOT:PATH="@CMAKE_BINARY_DIR@/lib/cmake/KokkosKernels" From 94761a00d7805d8bf12efadc39d6761d697795c7 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Wed, 6 Jan 2021 22:01:50 -0700 Subject: [PATCH 016/126] Fixing seg fault with empty matrix --- src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 4df18eb833..89c6f81a2c 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -97,8 +97,10 @@ void level_sched ( IlukHandle& thandle, level_ptr(level_list(i)-1) += 1; } - for ( size_type i = nlevels-1; i > 0; --i ) { - level_ptr(i) = level_ptr(i-1); + if (nlevels>0) {//note: to avoid wrapping around to the max of size_t when nlevels = 0. + for ( size_type i = nlevels-1; i > 0; --i ) { + level_ptr(i) = level_ptr(i-1); + } } level_ptr(0) = 0; From 5aea88b91bc9a2e8536bb967124a8d8ee7e21ba8 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 8 Jan 2021 17:20:50 -0700 Subject: [PATCH 017/126] Fix >1024 team size error in sort_crs_* Also add a test that replicated the issue. 
--- src/common/KokkosKernels_SparseUtils.hpp | 38 +++++++++++++----------- test_common/Test_Common_Sorting.hpp | 21 ++++++++----- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 6979f15847..85763608ec 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/common/KokkosKernels_SparseUtils.hpp @@ -1042,9 +1042,11 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const val using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; bool useRadix = !kk_is_gpu_exec_space(); + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if(numRows == 0) + return; SortCrsMatrixFunctor funct(useRadix, rowmap, entries, values); - lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; if(useRadix) { Kokkos::parallel_for("sort_crs_matrix", Kokkos::RangePolicy(0, numRows), funct); @@ -1054,16 +1056,15 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const val //Try to get teamsize to be largest power of 2 not greater than avg entries per row //TODO (probably important for performnce): add thread-level sort also, and use that //for small avg degree. But this works for now. 
- int teamSize = 1; - lno_t avgDeg = 0; - if(numRows) - avgDeg = (entries.extent(0) + numRows - 1) / numRows; - while(teamSize * 2 * 2 <= avgDeg) + lno_t idealTeamSize = 1; + lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; + while(idealTeamSize < avgDeg / 2) { - teamSize *= 2; + idealTeamSize *= 2; } - team_pol temp(numRows, teamSize); - teamSize = std::min(teamSize, temp.team_size_max(funct, Kokkos::ParallelForTag())); + team_pol temp(numRows, 1); + lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); + lno_t teamSize = std::min(idealTeamSize, maxTeamSize); Kokkos::parallel_for("sort_crs_matrix", team_pol(numRows, teamSize), funct); } } @@ -1090,9 +1091,11 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; bool useRadix = !kk_is_gpu_exec_space(); + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if(numRows == 0) + return; SortCrsGraphFunctor funct(useRadix, rowmap, entries); - lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; if(useRadix) { Kokkos::parallel_for("sort_crs_graph", Kokkos::RangePolicy(0, numRows), funct); @@ -1103,16 +1106,15 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) //half the entries per row. 0.5 * #entries is bitonic's parallelism within a row. //TODO (probably important for performnce): add thread-level sort also, and use that //for small avg degree. But this works for now. 
- int teamSize = 1; - lno_t avgDeg = 0; - if(numRows) - avgDeg = (entries.extent(0) + numRows - 1) / numRows; - while(teamSize * 2 * 2 <= avgDeg) + lno_t idealTeamSize = 1; + lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; + while(idealTeamSize < avgDeg / 2) { - teamSize *= 2; + idealTeamSize *= 2; } - team_pol temp(numRows, teamSize); - teamSize = std::min(teamSize, temp.team_size_max(funct, Kokkos::ParallelForTag())); + team_pol temp(numRows, 1); + lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); + lno_t teamSize = std::min(idealTeamSize, maxTeamSize); Kokkos::parallel_for("sort_crs_graph", team_pol(numRows, teamSize), funct); } } diff --git a/test_common/Test_Common_Sorting.hpp b/test_common/Test_Common_Sorting.hpp index 3a98c1f420..732ee4b451 100644 --- a/test_common/Test_Common_Sorting.hpp +++ b/test_common/Test_Common_Sorting.hpp @@ -544,7 +544,7 @@ void testBitonicSortLexicographic() } template -void testSortCRS(default_lno_t numRows, default_size_type nnz, bool doValues) +void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type nnz, bool doValues) { using scalar_t = default_scalar; using lno_t = default_lno_t; @@ -559,7 +559,7 @@ void testSortCRS(default_lno_t numRows, default_size_type nnz, bool doValues) //IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this //wouldn't test anything crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix - (numRows, numRows, nnz, 2, numRows / 2); + (numRows, numCols, nnz, 2, numCols / 2); auto rowmap = A.graph.row_map; auto entries = A.graph.entries; auto values = A.values; @@ -774,15 +774,20 @@ TEST_F( TestCategory, common_device_bitonic) { } TEST_F( TestCategory, common_sort_crsgraph) { - testSortCRS(10, 20, false); - testSortCRS(100, 2000, false); - testSortCRS(1000, 30000, false); + testSortCRS(10, 10, 20, false); + testSortCRS(100, 100, 2000, false); + testSortCRS(1000, 1000, 30000, false); } TEST_F( TestCategory, 
common_sort_crsmatrix) { - testSortCRS(10, 20, true); - testSortCRS(100, 2000, true); - testSortCRS(1000, 30000, true); + testSortCRS(10, 10, 20, true); + testSortCRS(100, 100, 2000, true); + testSortCRS(1000, 1000, 30000, true); +} + +TEST_F( TestCategory, common_sort_crs_longrows) { + testSortCRS(1, 50000, 10000, false); + testSortCRS(1, 50000, 10000, true); } TEST_F( TestCategory, common_sort_merge_crsmatrix) { From 887546e2178a0d4c800edab841bb3fed180368b1 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 12 Jan 2021 11:55:32 -0700 Subject: [PATCH 018/126] Smoke Test: the test now works fully The new test is guarded with KokkosKernels_ENABLE_INSTALL_TEST. When enabled it copies source files for the block pcg test, configures that test against the installation of Kokkos Kernels, compiles and runs the test, reports any failures. --- install_test/CMakeLists.txt.in | 20 +- install_test/run_install_test.sh.in | 27 ++- perf_test/sparse/KokkosSparse_block_pcg.cpp | 214 ++++++++++---------- 3 files changed, 148 insertions(+), 113 deletions(-) diff --git a/install_test/CMakeLists.txt.in b/install_test/CMakeLists.txt.in index 92f41059f5..74605ac73f 100644 --- a/install_test/CMakeLists.txt.in +++ b/install_test/CMakeLists.txt.in @@ -9,5 +9,23 @@ add_executable(kokkoskernels_install_test KokkosSparse_block_pcg.cpp) target_link_libraries(kokkoskernels_install_test PRIVATE Kokkos::kokkoskernels) if(BUILD_TESTING) - add_test(NAME cg_test COMMAND kokkoskernels_install_test) + + add_test(NAME cg_test_serial COMMAND kokkoskernels_install_test --mtx auto --serial) + + if(KOKKOS_ENABLE_THREADS) + add_test(NAME cg_test_threads COMMAND kokkoskernels_install_test --mtx auto --threads 2) + endif() + + if(KOKKOS_ENABLE_OPENMP) + add_test(NAME cg_test_openmp COMMAND kokkoskernels_install_test --mtx auto --openmp 2) + endif() + + if(KOKKOS_ENABLE_CUDA) + add_test(NAME cg_test_cuda COMMAND kokkoskernels_install_test --mtx auto --cuda) + endif() + + if(KOKKOS_ENABLE_HIP) + 
add_test(NAME cg_test_hip COMMAND kokkoskernels_install_test --mtx auto --hip) + endif() + endif() diff --git a/install_test/run_install_test.sh.in b/install_test/run_install_test.sh.in index 118a599bd9..0313ee954b 100755 --- a/install_test/run_install_test.sh.in +++ b/install_test/run_install_test.sh.in @@ -8,6 +8,29 @@ cd "${INSTALL_TEST_BUILD}" rm -rf CMake* cmake "${INSTALL_TEST_SOURCE}" \ - -D KokkosKernels_ROOT:PATH="@CMAKE_BINARY_DIR@/lib/cmake/KokkosKernels" | tee configure_install_test.log + -D KokkosKernels_ROOT:PATH="@CMAKE_BINARY_DIR@/lib/cmake/KokkosKernels" -make -j 4 | tee make_install_test.log +if [ $? -eq 0 ]; then + echo "*** install test: cmake configure SUCCESSFUL ***" +else + echo "*** install test: cmake configure FAILED ***" + exit 1; +fi + +make -j 4 + +if [ $? -eq 0 ]; then + echo "*** install test: build SUCCESSFUL ***" +else + echo "*** install test: build FAILED ***" + exit 1; +fi + +ctest -V -R + +if [ $? -eq 0 ]; then + echo "*** install test: run SUCCESSFUL ***" +else + echo "*** install test: run FAILED ***" + exit 1; +fi diff --git a/perf_test/sparse/KokkosSparse_block_pcg.cpp b/perf_test/sparse/KokkosSparse_block_pcg.cpp index 4f990f98ca..38e0b4ec33 100644 --- a/perf_test/sparse/KokkosSparse_block_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_block_pcg.cpp @@ -60,6 +60,69 @@ unsigned cg_iteration_limit = 10; +template +crsMat_t create_crs_matrix(char *mtx_bin_file) { + + using graph_t = typename crsMat_t::StaticCrsGraphType; + using row_map_view_t = typename graph_t::row_map_type::non_const_type; + using cols_view_t = typename graph_t::entries_type::non_const_type; + using values_view_t = typename crsMat_t::values_type::non_const_type; + using myExecSpace = typename crsMat_t::execution_space; + + crsMat_t crsmat; + + printf("matrix file: %s\n", mtx_bin_file); + + if(std::string(mtx_bin_file) == "auto") { + INDEX_TYPE num_rows = 11, num_cols = 11, nnz = 40; + crsmat = 
KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix(num_rows, num_cols, nnz, 3, 5); + printf("generating test matrix automatically\n"); + printf(" num rows: %d", num_rows); + printf(" num cols: %d", num_cols); + printf(" num non zeros: %d\n", nnz); + } else { + INDEX_TYPE nv = 0, ne = 0; + INDEX_TYPE *xadj, *adj; + SCALAR_TYPE *ew; + + KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); + + row_map_view_t rowmap_view ("rowmap_view", nv+1); + cols_view_t columns_view("colsmap_view", ne); + values_view_t values_view ("values_view", ne); + + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); + typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); + typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); + + for (INDEX_TYPE i = 0; i <= nv; ++i){ + hr(i) = xadj[i]; + } + for (INDEX_TYPE i = 0; i < ne; ++i){ + hc(i) = adj[i]; + hv(i) = ew[i]; + } + + Kokkos::deep_copy (rowmap_view , hr); + Kokkos::deep_copy (columns_view , hc); + Kokkos::deep_copy (values_view , hv); + } else { + KokkosKernels::Impl::copy_vector(ne, ew, values_view); + KokkosKernels::Impl::copy_vector(ne, adj, columns_view); + KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); + } + + graph_t static_graph (columns_view, rowmap_view); + crsmat = crsMat_t("CrsMatrix", nv, values_view, static_graph); + delete [] xadj; + delete [] adj; + delete [] ew; + } + + return crsmat; +} + template scalar_view_t create_x_vector(INDEX_TYPE nv, SCALAR_TYPE max_value = 1.0){ @@ -338,10 +401,11 @@ enum { CMD_USE_THREADS = 0 , CMD_USE_CORE_PER_NUMA , CMD_USE_CUDA , CMD_USE_OPENMP + , CMD_USE_SERIAL , CMD_USE_CUDA_DEV , CMD_BIN_MTX , CMD_ERROR - , CMD_COUNT }; + , CMD_COUNT}; int main (int argc, char ** argv){ @@ -355,7 +419,10 @@ int main (int argc, char ** argv){ for ( int i = 1 ; i < argc ; ++i ) { - if ( 0 == strcasecmp( argv[i] , 
"--threads" ) ) { + if ( 0 == strcasecmp( argv[i] , "--serial" ) ) { + cmdline[ CMD_USE_SERIAL ] = 1 ; + } + else if ( 0 == strcasecmp( argv[i] , "--threads" ) ) { kargs.num_threads = cmdline[ CMD_USE_THREADS ] = atoi( argv[++i] ); } else if ( 0 == strcasecmp( argv[i] , "--openmp" ) ) { @@ -380,14 +447,14 @@ int main (int argc, char ** argv){ std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ; std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; - return 0; + return 1; } } if (mtx_bin_file == NULL){ - std::cerr << "Provide a mtx binary file" << std::endl ; - std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; - return 0; + std::cerr << "Provide a mtx binary file or specify auto-generation" << std::endl ; + std::cerr << "OPTIONS\n\t--serial\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file|auto]" << std::endl; + return 1; } std::cout << "Running experiments with block size:" << block_size << std::endl; @@ -395,39 +462,37 @@ int main (int argc, char ** argv){ Kokkos::initialize(kargs); -#if defined( KOKKOS_ENABLE_THREADS ) +#if defined( KOKKOS_ENABLE_SERIAL ) - if ( cmdline[ CMD_USE_THREADS ] ) { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; + if ( cmdline[ CMD_USE_SERIAL ] ) { + using myExecSpace = Kokkos::Serial; + Kokkos::Serial::print_configuration(std::cout); + using crsMat_t = typename KokkosSparse::CrsMatrix; + crsMat_t crsmat = create_crs_matrix(mtx_bin_file); + INDEX_TYPE nv = crsmat.numRows(); - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - Kokkos::Threads::print_configuration(std::cout); - - typedef Kokkos::Threads myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; + using values_view_t = 
typename crsMat_t::values_type::non_const_type; + values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); + for (INDEX_TYPE i = nv; i < ((nv /block_size) + 1) * block_size; ++i){ + kok_x_original(i) = 0; + } + run_experiment(crsmat, kok_x_original, block_size); + } - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; +#endif - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); +#if defined( KOKKOS_ENABLE_THREADS ) - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); + if ( cmdline[ CMD_USE_THREADS ] ) { + using myExecSpace = Kokkos::Threads; + Kokkos::Threads::print_configuration(std::cout); - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - delete [] xadj; - delete [] adj; - delete [] ew; + using crsMat_t = typename KokkosSparse::CrsMatrix; + crsMat_t crsmat = create_crs_matrix(mtx_bin_file); + INDEX_TYPE nv = crsmat.numRows(); + using values_view_t = typename crsMat_t::values_type::non_const_type; values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); for (INDEX_TYPE i = nv; i < ((nv /block_size) + 1) * block_size; ++i){ kok_x_original(i) = 0; @@ -440,47 +505,18 @@ int main (int argc, char ** argv){ #if defined( KOKKOS_ENABLE_OPENMP ) if ( cmdline[ CMD_USE_OPENMP ] ) { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - + using myExecSpace = Kokkos::OpenMP; Kokkos::OpenMP::print_configuration(std::cout); - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, 
mtx_bin_file); - - - typedef Kokkos::OpenMP myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - //crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - + using crsMat_t = typename KokkosSparse::CrsMatrix; + crsMat_t crsmat = create_crs_matrix(mtx_bin_file); + using values_view_t = typename crsMat_t::values_type::non_const_type; values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); for (INDEX_TYPE i = nv; i < ((nv /block_size) + 1) * block_size; ++i){ kok_x_original(i) = 0; } run_experiment(crsmat, kok_x_original, block_size); - } #endif @@ -488,57 +524,15 @@ int main (int argc, char ** argv){ #if defined( KOKKOS_ENABLE_CUDA ) if ( cmdline[ CMD_USE_CUDA ] ) { // Use the last device: - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; + using myExecSpace = Kokkos::Cuda myExecSpace; Kokkos::Cuda::print_configuration(std::cout); - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::Cuda myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - 
typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - - { - typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); - typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); - typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); - - for (INDEX_TYPE i = 0; i <= nv; ++i){ - hr(i) = xadj[i]; - } - - for (INDEX_TYPE i = 0; i < ne; ++i){ - hc(i) = adj[i]; - hv(i) = ew[i]; - } - Kokkos::deep_copy (rowmap_view , hr); - Kokkos::deep_copy (columns_view , hc); - Kokkos::deep_copy (values_view , hv); - - - } - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - delete [] xadj; - delete [] adj; - delete [] ew; + using crsMat_t = typename KokkosSparse::CrsMatrix; + crsMat_t crsmat = create_crs_matrix(mtx_bin_file); + using values_view_t = typename crsMat_t::values_type::non_const_type; values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); run_experiment(crsmat, kok_x_original, block_size); - - } #endif From ceb15c59157b8466daeb40067236585dbb72fdb7 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 12 Jan 2021 17:52:50 -0800 Subject: [PATCH 019/126] Smoke Test: fixing bug in Cuda code branch and forwarding Kokkos compiler --- install_test/run_install_test.sh.in | 5 ++++- perf_test/sparse/KokkosSparse_block_pcg.cpp | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/install_test/run_install_test.sh.in b/install_test/run_install_test.sh.in index 0313ee954b..b8c51d2985 100755 --- a/install_test/run_install_test.sh.in +++ b/install_test/run_install_test.sh.in @@ -7,8 +7,11 @@ INSTALL_TEST_BUILD="@CMAKE_CURRENT_BINARY_DIR@/build" cd 
"${INSTALL_TEST_BUILD}" rm -rf CMake* +echo "Kokkos at: @CMAKE_CXX_COMPILER@" + cmake "${INSTALL_TEST_SOURCE}" \ - -D KokkosKernels_ROOT:PATH="@CMAKE_BINARY_DIR@/lib/cmake/KokkosKernels" + -D CMAKE_CXX_COMPILER="@CMAKE_CXX_COMPILER@" \ + -D KokkosKernels_ROOT:PATH="@CMAKE_BINARY_DIR@/lib64/cmake/KokkosKernels" if [ $? -eq 0 ]; then echo "*** install test: cmake configure SUCCESSFUL ***" diff --git a/perf_test/sparse/KokkosSparse_block_pcg.cpp b/perf_test/sparse/KokkosSparse_block_pcg.cpp index 38e0b4ec33..f4833cda17 100644 --- a/perf_test/sparse/KokkosSparse_block_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_block_pcg.cpp @@ -510,6 +510,7 @@ int main (int argc, char ** argv){ using crsMat_t = typename KokkosSparse::CrsMatrix; crsMat_t crsmat = create_crs_matrix(mtx_bin_file); + INDEX_TYPE nv = crsmat.numRows(); using values_view_t = typename crsMat_t::values_type::non_const_type; values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); @@ -524,11 +525,12 @@ int main (int argc, char ** argv){ #if defined( KOKKOS_ENABLE_CUDA ) if ( cmdline[ CMD_USE_CUDA ] ) { // Use the last device: - using myExecSpace = Kokkos::Cuda myExecSpace; + using myExecSpace = Kokkos::Cuda; Kokkos::Cuda::print_configuration(std::cout); using crsMat_t = typename KokkosSparse::CrsMatrix; crsMat_t crsmat = create_crs_matrix(mtx_bin_file); + INDEX_TYPE nv = crsmat.numRows(); using values_view_t = typename crsMat_t::values_type::non_const_type; values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); From 384cdb2042b21e96f437dcae5463084ec06767a5 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 12 Jan 2021 18:23:49 -0800 Subject: [PATCH 020/126] Smoke Test: adding variable to pick correct install dir for Kokkos Kernels --- install_test/run_install_test.sh.in | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/install_test/run_install_test.sh.in b/install_test/run_install_test.sh.in index 
b8c51d2985..a3b0fd6a59 100755 --- a/install_test/run_install_test.sh.in +++ b/install_test/run_install_test.sh.in @@ -7,11 +7,9 @@ INSTALL_TEST_BUILD="@CMAKE_CURRENT_BINARY_DIR@/build" cd "${INSTALL_TEST_BUILD}" rm -rf CMake* -echo "Kokkos at: @CMAKE_CXX_COMPILER@" - cmake "${INSTALL_TEST_SOURCE}" \ -D CMAKE_CXX_COMPILER="@CMAKE_CXX_COMPILER@" \ - -D KokkosKernels_ROOT:PATH="@CMAKE_BINARY_DIR@/lib64/cmake/KokkosKernels" + -D KokkosKernels_ROOT:PATH="@CMAKE_BINARY_DIR@/@CMAKE_INSTALL_LIBDIR@/cmake/KokkosKernels" if [ $? -eq 0 ]; then echo "*** install test: cmake configure SUCCESSFUL ***" From cd4eefa6fe013149a56b36c47b05bb396587c85d Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 18 Jan 2021 18:06:33 -0700 Subject: [PATCH 021/126] Move spmv fallback branch out of TPL layer This was causing the entire native impl to get instantiated once every time KokkosSparse_spmv.hpp was included. --- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 41 ++--------- src/sparse/KokkosSparse_spmv.hpp | 72 ++++++++++++++----- 2 files changed, 60 insertions(+), 53 deletions(-) diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index ced3476539..f12fe7a16b 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -50,35 +50,10 @@ #include "cusparse.h" #include "KokkosKernels_SparseUtils_cusparse.hpp" #include "KokkosKernels_Controls.hpp" -#include "KokkosSparse_spmv_impl.hpp" namespace KokkosSparse { namespace Impl { - template - void spmv_native(const KokkosKernels::Experimental::Controls& controls, - const char mode[], - typename YVector::non_const_value_type const & alpha, - const AMatrix& A, - const XVector& x, - typename YVector::non_const_value_type const & beta, - const YVector& y) { - using KAT = Kokkos::Details::ArithTraits; - - if (beta == KAT::zero ()) { - KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); - } - else if (beta 
== KAT::one ()) { - KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); - } - else if (beta == -KAT::one ()) { - KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); - } - else { - KokkosSparse::Impl::spmv_beta (controls, mode, alpha, A, x, beta, y); - } - } - template void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, const char mode[], @@ -265,18 +240,10 @@ namespace Impl { const XVector& x, \ const coefficient_type& beta, \ const YVector& y) { \ - bool fallback = *mode == 'C' || ((*mode == 'T' || *mode == 'H') && 9000 <= CUDA_VERSION && CUDA_VERSION < 10000); \ - if((controls.isParameter("algorithm") && controls.getParameter("algorithm") == "native") || fallback) { \ - std::string label = "KokkosSparse::spmv[NATIVE," + Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_native(controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } else { \ - std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + Kokkos::ArithTraits::name() + "]"; \ - Kokkos::Profiling::pushRegion(label); \ - spmv_cusparse(controls, mode, alpha, A, x, beta, y); \ - Kokkos::Profiling::popRegion(); \ - } \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_cusparse(controls, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index 4c26f5cd6e..ca83cb217b 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -151,23 +151,63 @@ spmv (KokkosKernels::Experimental::Controls controls, KokkosBlas::scal(y_i, beta, y_i); return; } - Impl::SPMV< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename 
XVector_Internal::value_type*, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type*, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits> - ::spmv (controls, mode, alpha, A_i, x_i, beta, y_i); -} + //Whether to call KokkosKernel's native implementation, even if a TPL impl is available + bool useFallback = controls.isParameter("algorithm") && controls.getParameter("algorithm") == "native"; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + //cuSPARSE does not support the conjugate mode (C), and cuSPARSE 9 only supports the normal (N) mode. +#if (9000 <= CUDA_VERSION) + useFallback = useFallback || (mode[0] != NoTranspose[0]); +#endif +#if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) + useFallback = useFallback || (mode[0] == Conjugate[0]); +#endif +#endif + + if(useFallback) + { + //Explicitly call the non-TPL SPMV implementation + std::string label = "KokkosSparse::spmv[NATIVE," + Kokkos::ArithTraits::name() + "]"; + Kokkos::Profiling::pushRegion(label); + Impl::SPMV< + typename AMatrix_Internal::value_type, + typename AMatrix_Internal::ordinal_type, + typename AMatrix_Internal::device_type, + typename AMatrix_Internal::memory_traits, + typename AMatrix_Internal::size_type, + typename XVector_Internal::value_type*, + typename XVector_Internal::array_layout, + typename XVector_Internal::device_type, + typename XVector_Internal::memory_traits, + typename YVector_Internal::value_type*, + typename YVector_Internal::array_layout, + typename YVector_Internal::device_type, + typename YVector_Internal::memory_traits, + false> + ::spmv (controls, mode, alpha, A_i, x_i, beta, y_i); + Kokkos::Profiling::popRegion(); + } + else + { + //note: the cuSPARSE spmv wrapper defines a profiling region, so one is not needed here. 
+ Impl::SPMV< + typename AMatrix_Internal::value_type, + typename AMatrix_Internal::ordinal_type, + typename AMatrix_Internal::device_type, + typename AMatrix_Internal::memory_traits, + typename AMatrix_Internal::size_type, + typename XVector_Internal::value_type*, + typename XVector_Internal::array_layout, + typename XVector_Internal::device_type, + typename XVector_Internal::memory_traits, + typename YVector_Internal::value_type*, + typename YVector_Internal::array_layout, + typename YVector_Internal::device_type, + typename YVector_Internal::memory_traits> + ::spmv (controls, mode, alpha, A_i, x_i, beta, y_i); + } +} template From 208d8a5676ac839571dd7dde4042201d719791dc Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 19 Jan 2021 12:38:54 -0700 Subject: [PATCH 022/126] update compileKokkosKernels* scripts --- example/buildlib/compileKokkosKernels.sh | 31 ++++++++++++------- .../buildlib/compileKokkosKernelsSimple.sh | 14 ++++++--- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/example/buildlib/compileKokkosKernels.sh b/example/buildlib/compileKokkosKernels.sh index 9f5978bb58..4bf57ce316 100755 --- a/example/buildlib/compileKokkosKernels.sh +++ b/example/buildlib/compileKokkosKernels.sh @@ -1,13 +1,22 @@ -KOKKOS_PATH=${HOME}/work/kokkos #path to kokkos source -KOKKOSKERNELS_SCALARS='double,"complex"' #the scalar types to instantiate =double,float... -KOKKOSKERNELS_LAYOUTS=LayoutLeft #the layout types to instantiate. -KOKKOSKERNELS_ORDINALS=int,long #ordinal types to instantiate -KOKKOSKERNELS_OFFSETS=int,size_t #offset types to instantiate -KOKKOSKERNELS_PATH=../.. #path to kokkos-kernels top directory. -KOKKOSKERNELS_OPTIONS=eti-only #options for kokkoskernels +#!/bin/bash +# Requires cmake version > 3.12 +# Paths to source +KOKKOS_PATH="${HOME}/Kokkos/kokkos" #path to kokkos source +KOKKOSKERNELS_PATH="../.." 
#path to kokkos-kernels top directory + +# Compiler - must be passed to kokkos and kokkos-kernels configurations +CXX=${KOKKOS_PATH}/bin/nvcc_wrapper #Options: icpc #g++ #clang++ CXXFLAGS="-Wall -pedantic -Werror -O3 -g -Wshadow -Wsign-compare -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized" -CXX=${KOKKOS_PATH}/bin/nvcc_wrapper #icpc # -KOKKOS_DEVICES=Serial,Cuda,OpenMP #devices Cuda... -KOKKOS_ARCHS=Pascal60,Power8 -../../scripts/generate_makefile.bash --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --kokkos-path=${KOKKOS_PATH} --with-devices=${KOKKOS_DEVICES} --arch=${KOKKOS_ARCHS} --compiler=${CXX} --with-options=${KOKKOSKERNELS_OPTIONS} --cxxflags="${CXXFLAGS}" +# Configure Kokkos (Unit Tests OFF) - Makefile located in kokkos-build +cmake -Bkokkos-build -DCMAKE_CXX_COMPILER=${CXX} -DKokkos_ARCH_PASCAL60=ON -DKokkos_ARCH_POWER8=ON -DKokkos_ENABLE_CUDA=ON -DKokkos_ENABLE_SERIAL=ON -DKokkos_ENABLE_OPENMP=ON -DKokkos_ENABLE_CUDA_LAMBDA=ON -DCMAKE_CXX_FLAGS="${CXXFLAGS}" -DCMAKE_INSTALL_PREFIX="${PWD}/kokkos-install" -DKokkos_ENABLE_TESTS=OFF ${KOKKOS_PATH} + +# Build and Install Kokkos - install lib at ${PWD}/kokkos-install +cmake --build kokkos-build -j 8 --target install + + +# Configure KokkosKernels (Unit Tests OFF) - Makefile located in kokkoskernels-build +cmake -Bkokkoskernels-build -DCMAKE_CXX_COMPILER=${CXX} -DKokkos_ROOT="${PWD}/kokkos-install" -DKokkosKernels_INST_DOUBLE=ON -DKokkosKernels_INST_COMPLEX_DOUBLE=ON -DKokkosKernels_INST_ORDINAL_INT=ON -DKokkosKernels_INST_ORDINAL_INT64_T=ON -DKokkosKernels_INST_OFFSET_INT=ON -DKokkosKernels_INST_OFFSET_SIZE_T=ON -DKokkosKernels_INST_LAYOUTLEFT=ON -DKokkosKernels_ADD_DEFAULT_ETI=ON -DCMAKE_INSTALL_PREFIX="${PWD}/kokkoskernels-install" -DKokkosKernels_ENABLE_TESTS=OFF -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF ${KOKKOSKERNELS_PATH} + +# Build and Install KokkosKernels - install lib at 
${PWD}/kokkoskernels-install +cmake --build kokkoskernels-build -j 8 --target install diff --git a/example/buildlib/compileKokkosKernelsSimple.sh b/example/buildlib/compileKokkosKernelsSimple.sh index 20d0a7aef4..9502235aba 100755 --- a/example/buildlib/compileKokkosKernelsSimple.sh +++ b/example/buildlib/compileKokkosKernelsSimple.sh @@ -1,13 +1,17 @@ -KOKKOS_PATH=${HOME}/proj/kokkos #path to kokkos source +KOKKOS_PATH="${HOME}/Kokkos/kokkos" #path to kokkos source +KOKKOSKERNELS_PATH="../.." #path to kokkos-kernels top directory. + KOKKOSKERNELS_SCALARS=double #the scalar types to instantiate =double,float... KOKKOSKERNELS_LAYOUTS=LayoutLeft #the layout types to instantiate. KOKKOSKERNELS_ORDINALS=int #ordinal types to instantiate KOKKOSKERNELS_OFFSETS=int #offset types to instantiate -KOKKOSKERNELS_PATH=../.. #path to kokkos-kernels top directory. -CXX=${KOKKOS_PATH}/bin/nvcc_wrapper #icpc # +CXX=${KOKKOS_PATH}/bin/nvcc_wrapper KOKKOSKERNELS_OPTIONS=eti-only #options for kokkoskernels -KOKKOS_DEVICES=Cuda # other devices Cuda,Serial .. 
+KOKKOS_DEVICES=Cuda KOKKOS_ARCHS=SKX,Volta70 +KOKKOS_CUDA_OPTIONS=enable_lambda CXXFLAGS="-Wall -pedantic -Werror -O3 -g -Wshadow -Wsign-compare -Wtype-limits -Wuninitialized" -../../scripts/generate_makefile.bash --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --kokkos-path=${KOKKOS_PATH} --with-devices=${KOKKOS_DEVICES} --arch=${KOKKOS_ARCHS} --compiler=${CXX} --with-options=${KOKKOSKERNELS_OPTIONS} --cxxflags="${CXXFLAGS}" +../../cm_generate_makefile.bash --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --kokkos-path=${KOKKOS_PATH} --with-devices=${KOKKOS_DEVICES} --arch=${KOKKOS_ARCHS} --compiler=${CXX} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} --with-options=${KOKKOSKERNELS_OPTIONS} --cxxflags="${CXXFLAGS}" + +# Call "../../scripts/cm_generate_makefile.bash --help" for options From d97be703b808fc454b2b18506cf0f4c758653b11 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Thu, 21 Jan 2021 15:22:44 -0700 Subject: [PATCH 023/126] IntelClang guarding __assume_aligned with !defined(__clang__) With the new check we are making sure that the IntelClang compilers that do not define this function are not generating build errors. This fixes an issue in Kokkos Kernels on the iris/yarrow platforms. 
--- perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp | 2 +- src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp b/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp index 285bd1038f..d769d3a4da 100644 --- a/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp +++ b/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp @@ -138,7 +138,7 @@ void openmp_smart_static_matvec(AType A, XType x, YType y) { #pragma omp parallel { -#ifdef KOKKOS_COMPILER_INTEL +#if defined(KOKKOS_COMPILER_INTEL) && !defined(__clang__) __assume_aligned(x_ptr, 64); __assume_aligned(y_ptr, 64); #endif diff --git a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp index 72c8a969fe..7ac4936f51 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp @@ -69,7 +69,7 @@ void spmv_raw_openmp_no_transpose(typename YVector::const_value_type& s_a, AMatr typename YVector::const_value_type zero = 0; #pragma omp parallel { -#ifdef KOKKOS_COMPILER_INTEL +#if defined(KOKKOS_COMPILER_INTEL) && !defined(__clang__) __assume_aligned(x_ptr, 64); __assume_aligned(y_ptr, 64); #endif From 6b1c3ecba4f5ebeaeccae7ec59bd4228a86a6916 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 27 Jan 2021 10:43:24 -0700 Subject: [PATCH 024/126] cmake: Add ARMPL TPL support --- BUILD.md | 9 +++++++++ cm_generate_makefile.bash | 2 +- cmake/Dependencies.cmake | 2 +- cmake/KokkosKernels_config.h.in | 2 ++ cmake/Modules/FindTPLARMPL.cmake | 32 ++++++++++++++++++++++++++++++++ cmake/compile_tests/armpl.cpp | 5 +++++ cmake/kokkoskernels_tpls.cmake | 2 ++ 7 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 cmake/Modules/FindTPLARMPL.cmake create mode 100644 cmake/compile_tests/armpl.cpp diff --git a/BUILD.md b/BUILD.md index 19ea0fd573..023cf96f4e 100644 --- a/BUILD.md +++ b/BUILD.md @@ -125,6 +125,12 @@ endif() * 
CUSPARSE_LIBRARY_DIRS: STRING * Optional override for the library directories that comprise TPL CUSPARSE. * Default: None. Default common library locations will be searched +* ARMPL_LIBRARIES: STRING + * Optional override for the libraries that comprise TPL ARMPL. + * Default: None. Default common library names will be searched +* ARMPL_LIBRARY_DIRS: STRING + * Optional override for the library directories that comprise TPL ARMPL. + * Default: None. Default common library locations will be searched * KokkosKernels_BLAS_ROOT: PATH * Location of BLAS install root. * Default: None or the value of the environment variable BLAS_ROOT if set @@ -161,6 +167,9 @@ endif() * KokkosKernels_ENABLE_TPL_MKL: BOOL * Whether to enable MKL * Default: OFF +* KokkosKernels_ENABLE_TPL_ARMPL: BOOL + * Whether to enable ARMPL + * Default: OFF * KokkosKernels_ETI_ONLY: BOOL * Whether to restrict availability of kernels to ETI types only. Turning this on guarantees that kernels are never built inside of object files which simply call KokkosKernels functions. * Default: OFF diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index 82e854e07d..d8cfa6a18c 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -313,7 +313,7 @@ display_help_text() { echo "--with-tpls=[TPLS]: Set tpls to be instantiated (Proper support requies that appropriate compiler and device must be enabled)." echo " This may require providing paths and the library name if using custom installs not on a default path" echo " that CMake searches" - echo " Options: blas, mkl, cublas, cusparse, magma" + echo " Options: blas, mkl, cublas, cusparse, magma, armpl" echo "--user-blas-path=[PATH]: Set path to location of user-specified BLAS library." echo "--user-blas-lib=[LIB]: Library name of desired BLAS install." 
echo " Example: For the typical \"libblas.a\" provide \"blas\"" diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 66990dd126..0aa97b1d6c 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA SUPERLU CHOLMOD LAPACKE CBLAS + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA SUPERLU CHOLMOD LAPACKE CBLAS ARMPL TEST_OPTIONAL_TPLS yaml-cpp ) diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index c0a1e98ec6..c252f15bc0 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -102,6 +102,8 @@ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_LAPACKE /* METIS */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_METIS +/* ARMPL */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_ARMPL #cmakedefine KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV diff --git a/cmake/Modules/FindTPLARMPL.cmake b/cmake/Modules/FindTPLARMPL.cmake new file mode 100644 index 0000000000..4a0f485192 --- /dev/null +++ b/cmake/Modules/FindTPLARMPL.cmake @@ -0,0 +1,32 @@ +IF (ARMPL_LIBRARY_DIRS AND ARMPL_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES ${ARMPL_LIBRARIES} LIBRARY_PATHS ${ARMPL_LIBRARY_DIRS}) +ELSEIF (ARMPL_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES ${ARMPL_LIBRARIES}) +ELSEIF (ARMPL_LIBRARY_DIRS) + KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES armpl LIBRARY_PATHS ${ARMPL_LIBRARY_DIRS}) +ELSEIF (DEFINED ENV{ARMPL_DIR}) + SET(ARMPL_ROOT $ENV{ARMPL_DIR}) + KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE + LIBRARIES + amath + armpl + LIBRARY_PATHS + ${ARMPL_ROOT}/lib + HEADERS + armpl.h + HEADER_PATHS + ${ARMPL_ROOT}/include + ) +ELSE() + FIND_PACKAGE(ARMPL REQUIRED) + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ARMPL INTERFACE LINK_LIBRARIES ${ARMPL_LIBRARIES}) +ENDIF() + +TRY_COMPILE(KOKKOSKERNELS_TRY_COMPILE_ARMPL + 
${KOKKOSKERNELS_TOP_BUILD_DIR}/tpl_tests + ${KOKKOSKERNELS_TOP_SOURCE_DIR}/cmake/compile_tests/armpl.cpp + LINK_LIBRARIES -larmpl -lgfortran -lamath -lm + OUTPUT_VARIABLE KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT) +IF(NOT KOKKOSKERNELS_TRY_COMPILE_ARMPL) + MESSAGE(FATAL_ERROR "KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT=${KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT}") +ENDIF() diff --git a/cmake/compile_tests/armpl.cpp b/cmake/compile_tests/armpl.cpp new file mode 100644 index 0000000000..9bb1c48392 --- /dev/null +++ b/cmake/compile_tests/armpl.cpp @@ -0,0 +1,5 @@ +#include + +int main(void) { + return 0; +} diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index 08230dd987..3aebad11b5 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -468,6 +468,7 @@ ENDIF() KOKKOSKERNELS_ADD_TPL_OPTION(CHOLMOD OFF "Whether to enable CHOLMOD") KOKKOSKERNELS_ADD_TPL_OPTION(SUPERLU OFF "Whether to enable SUPERLU") KOKKOSKERNELS_ADD_TPL_OPTION(METIS OFF "Whether to enable METIS") +KOKKOSKERNELS_ADD_TPL_OPTION(ARMPL OFF "Whether to enable ARMPL") # We need to do all the import work IF (NOT KOKKOSKERNELS_HAS_TRILINOS) @@ -481,6 +482,7 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) KOKKOSKERNELS_IMPORT_TPL(CHOLMOD) KOKKOSKERNELS_IMPORT_TPL(SUPERLU) KOKKOSKERNELS_IMPORT_TPL(METIS) + KOKKOSKERNELS_IMPORT_TPL(ARMPL) ENDIF() #Convert list to newlines (which CMake doesn't always like in cache variables) From a7f14fbac0a0e1b4c04fe0893f8a24c70146d8e6 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 27 Jan 2021 13:44:31 -0700 Subject: [PATCH 025/126] scripts: Update cmake to 3.19.3 --- scripts/cm_test_all_sandia | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 54f4bd4621..beb2857abb 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -660,10 +660,10 @@ elif [ "$MACHINE" = "blake" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 - module load 
cmake/3.12.3 + module load cmake/3.19.3 - BASE_MODULE_LIST="cmake/3.12.3,/" - BASE_MODULE_LIST_INTEL="cmake/3.12.3,/compilers/" + BASE_MODULE_LIST="cmake/3.19.3,/" + BASE_MODULE_LIST_INTEL="cmake/3.19.3,/compilers/" if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) From c19a915cc9472dde8cdb075d4ad3a86398a43ce6 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 27 Jan 2021 15:48:04 -0700 Subject: [PATCH 026/126] scripts: Reduce build list size --- scripts/cm_test_all_sandia | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index beb2857abb..bc55f6cd04 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -677,8 +677,8 @@ elif [ "$MACHINE" = "blake" ]; then # Format: (compiler module-list build-list exe-name warning-flag) # TODO: Failing toolchains: #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL "OpenMP,Pthread" icpc $INTEL_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" ) else COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" From 51f7193bd79235d98258e81eef39e62f73e2a4f7 Mon Sep 17 00:00:00 2001 From: Luc Date: Thu, 28 Jan 2021 05:28:17 +0000 Subject: [PATCH 027/126] Matrix Converter: fixing issue with deallocation after Kokkos::fininalize() Small change but we no longer get an error while running this. 
--- test_common/KokkosKernels_MatrixConverter.cpp | 312 +++++++++--------- 1 file changed, 157 insertions(+), 155 deletions(-) diff --git a/test_common/KokkosKernels_MatrixConverter.cpp b/test_common/KokkosKernels_MatrixConverter.cpp index 3f3fe11bae..41fb5ebc2c 100644 --- a/test_common/KokkosKernels_MatrixConverter.cpp +++ b/test_common/KokkosKernels_MatrixConverter.cpp @@ -53,8 +53,6 @@ int main (int argc, char* argv[]){ typedef int size_type; typedef int idx; typedef double wt; - - Kokkos::initialize(argc,argv); bool symmetrize = false, remove_diagonal = false, transpose = false; char *in_mtx = NULL, *out_bin = NULL; @@ -92,204 +90,208 @@ int main (int argc, char* argv[]){ exit(1); } - typedef Kokkos::DefaultHostExecutionSpace MyExecSpace; - typedef typename KokkosSparse::CrsMatrix crstmat_t; - typedef typename crstmat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crstmat_t::values_type::non_const_type values_view_t; + Kokkos::initialize(argc,argv); + { + + typedef Kokkos::DefaultHostExecutionSpace MyExecSpace; + + typedef typename KokkosSparse::CrsMatrix crstmat_t; + typedef typename crstmat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crstmat_t::values_type::non_const_type values_view_t; - typedef typename graph_t::row_map_type::const_type c_row_map_view_t; - typedef typename graph_t::entries_type::const_type c_cols_view_t; - typedef typename crstmat_t::values_type::const_type c_values_view_t; + typedef typename graph_t::row_map_type::const_type c_row_map_view_t; + typedef typename graph_t::entries_type::const_type c_cols_view_t; + typedef typename crstmat_t::values_type::const_type c_values_view_t; - crstmat_t a_crsmat = 
KokkosKernels::Impl::read_kokkos_crst_matrix(in_mtx); + crstmat_t a_crsmat = KokkosKernels::Impl::read_kokkos_crst_matrix(in_mtx); - c_row_map_view_t orm = a_crsmat.graph.row_map; - c_cols_view_t oentries = a_crsmat.graph.entries; - c_values_view_t ovalues = a_crsmat.values; + c_row_map_view_t orm = a_crsmat.graph.row_map; + c_cols_view_t oentries = a_crsmat.graph.entries; + c_values_view_t ovalues = a_crsmat.values; - const size_type *prm = orm.data(); - const idx *pentries = oentries.data(); - const wt *pvals = ovalues.data(); + const size_type *prm = orm.data(); + const idx *pentries = oentries.data(); + const wt *pvals = ovalues.data(); - idx numrows = a_crsmat.numRows(); - //idx numcols = a_crsmat.numCols(); - idx nnz = ovalues.extent(0); - std::cout << "numrows :" << numrows << " nnz:" << nnz << std::endl; - //Kokkos::deep_copy(new_rowmap, a_crsmat.graph.row_map); + idx numrows = a_crsmat.numRows(); + //idx numcols = a_crsmat.numCols(); + idx nnz = ovalues.extent(0); + std::cout << "numrows :" << numrows << " nnz:" << nnz << std::endl; + //Kokkos::deep_copy(new_rowmap, a_crsmat.graph.row_map); - if (remove_diagonal) { - std::vector nrm(numrows + 1, 0); - std::vector nentries(nnz + 1); - std::vector nvals(nnz + 1); + if (remove_diagonal) { + std::vector nrm(numrows + 1, 0); + std::vector nentries(nnz + 1); + std::vector nvals(nnz + 1); - for (idx i = 0; i < numrows; ++i){ + for (idx i = 0; i < numrows; ++i){ - size_type begin = prm[i]; - size_type end = prm[i+1]; - for (size_type j = begin; j < end; ++ j){ - idx col = pentries[j]; - //wt val = pvals[j]; + size_type begin = prm[i]; + size_type end = prm[i+1]; + for (size_type j = begin; j < end; ++ j){ + idx col = pentries[j]; + //wt val = pvals[j]; - if (i == col){ - nrm[i] = 1; - break; - } + if (i == col){ + nrm[i] = 1; + break; + } + } } - } - size_type prefix = 0; - for (idx i = 0; i <= numrows; ++i){ - size_type current = nrm[i]; - nrm[i] = prefix; - prefix += current; + size_type prefix = 0; + for (idx i 
= 0; i <= numrows; ++i){ + size_type current = nrm[i]; + nrm[i] = prefix; + prefix += current; - } + } - for (idx i = 0; i <= numrows; ++i){ - nrm[i] = prm[i] - nrm[i]; - } + for (idx i = 0; i <= numrows; ++i){ + nrm[i] = prm[i] - nrm[i]; + } - for (idx i = 0; i < numrows; ++i){ + for (idx i = 0; i < numrows; ++i){ - size_type begin = prm[i]; - size_type end = prm[i+1]; + size_type begin = prm[i]; + size_type end = prm[i+1]; - size_type obegin = nrm[i]; + size_type obegin = nrm[i]; - for (size_type j = begin; j < end; ++ j){ - idx col = pentries[j]; - wt val = pvals[j]; - if (i != col){ - nentries[obegin] = col; - nvals[obegin++] = val; - } - } - if (obegin != nrm[i+1]){ - std::cout << "i:" << i << " nrm[i+1]:" << nrm[i+1] << " obegin:" << obegin << std::endl; - exit(1); + for (size_type j = begin; j < end; ++ j){ + idx col = pentries[j]; + wt val = pvals[j]; + if (i != col){ + nentries[obegin] = col; + nvals[obegin++] = val; + } + } + if (obegin != nrm[i+1]){ + std::cout << "i:" << i << " nrm[i+1]:" << nrm[i+1] << " obegin:" << obegin << std::endl; + exit(1); + } } - } - row_map_view_t new_rowmap ("new rowmap", numrows + 1); + row_map_view_t new_rowmap ("new rowmap", numrows + 1); - cols_view_t new_entries("new colmap", nrm[numrows]); - values_view_t new_values("new values", nrm[numrows ]); + cols_view_t new_entries("new colmap", nrm[numrows]); + values_view_t new_values("new values", nrm[numrows ]); - for (idx i = 0; i <= numrows; ++i){ - new_rowmap(i) = nrm[i]; - } - - for (size_type i = 0; i < nrm[numrows ]; ++i){ - new_entries(i) = nentries[i]; - new_values(i) = nvals[i]; - } + for (idx i = 0; i <= numrows; ++i){ + new_rowmap(i) = nrm[i]; + } - graph_t transpose_graph(new_entries, new_rowmap); - crstmat_t transpose_matrix("transpose", numrows, new_values, transpose_graph); - a_crsmat = transpose_matrix; + for (size_type i = 0; i < nrm[numrows ]; ++i){ + new_entries(i) = nentries[i]; + new_values(i) = nvals[i]; + } + graph_t transpose_graph(new_entries, 
new_rowmap); + crstmat_t transpose_matrix("transpose", numrows, new_values, transpose_graph); + a_crsmat = transpose_matrix; - orm = a_crsmat.graph.row_map; - oentries = a_crsmat.graph.entries; - ovalues = a_crsmat.values; - prm = orm.data(); - pentries = oentries.data(); - pvals = ovalues.data(); + orm = a_crsmat.graph.row_map; + oentries = a_crsmat.graph.entries; + ovalues = a_crsmat.values; - numrows = a_crsmat.numRows(); - //numcols = a_crsmat.numCols(); - nnz = ovalues.extent(0); - } + prm = orm.data(); + pentries = oentries.data(); + pvals = ovalues.data(); - if (symmetrize) { + numrows = a_crsmat.numRows(); + //numcols = a_crsmat.numCols(); + nnz = ovalues.extent(0); + } - row_map_view_t new_rowmap; - cols_view_t new_entries; + if (symmetrize) { - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap - - (numrows, orm, oentries, new_rowmap, new_entries); - values_view_t new_values("new_values",new_entries.extent(0)); + row_map_view_t new_rowmap; + cols_view_t new_entries; - cols_view_t out_adj ("out_adj", new_entries.extent(0)); - values_view_t out_vals("out_vals", new_entries.extent(0)); + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap + + (numrows, orm, oentries, new_rowmap, new_entries); + values_view_t new_values("new_values",new_entries.extent(0)); - KokkosKernels::Impl::kk_sort_graph - (new_rowmap, new_entries, new_values, out_adj, out_vals); - new_entries = out_adj; - new_values = out_vals; + cols_view_t out_adj ("out_adj", new_entries.extent(0)); + values_view_t out_vals("out_vals", new_entries.extent(0)); - graph_t symmetric_graph(new_entries, new_rowmap); - crstmat_t symmetric_marix("transpose", numrows, new_values, symmetric_graph); - a_crsmat = symmetric_marix; + KokkosKernels::Impl::kk_sort_graph + (new_rowmap, new_entries, new_values, out_adj, out_vals); + new_entries = out_adj; + new_values = out_vals; - orm = a_crsmat.graph.row_map; - oentries = a_crsmat.graph.entries; - ovalues = a_crsmat.values; + graph_t 
symmetric_graph(new_entries, new_rowmap); + crstmat_t symmetric_marix("transpose", numrows, new_values, symmetric_graph); + a_crsmat = symmetric_marix; - prm = orm.data(); - pentries = oentries.data(); - pvals = ovalues.data(); + orm = a_crsmat.graph.row_map; + oentries = a_crsmat.graph.entries; + ovalues = a_crsmat.values; - numrows = a_crsmat.numRows(); - //numcols = a_crsmat.numCols(); - nnz = ovalues.extent(0); - } - if (transpose) { - row_map_view_t new_rowmap ("new_rowmap", a_crsmat.numCols() + 1); - cols_view_t new_entries ("new_rowmap", a_crsmat.nnz()); - values_view_t new_values ("new_rowmap", a_crsmat.nnz()); - - KokkosKernels::Impl::transpose_matrix< - c_row_map_view_t, c_cols_view_t, c_values_view_t, - row_map_view_t, cols_view_t, values_view_t, row_map_view_t, MyExecSpace>( - a_crsmat.numRows(), a_crsmat.numCols(), - a_crsmat.graph.row_map, a_crsmat.graph.entries, a_crsmat.values, - new_rowmap, new_entries, new_values); - - std::cout << 1 << std::endl; - cols_view_t out_adj ("out_adj", new_entries.extent(0)); - values_view_t out_vals("out_vals", new_entries.extent(0)); - std::cout << 2 << std::endl; - KokkosKernels::Impl::kk_sort_graph - (new_rowmap, new_entries, new_values, out_adj, out_vals); - new_entries = out_adj; - new_values = out_vals; - std::cout << 3 << std::endl; - MyExecSpace().fence(); - KokkosKernels::Impl::kk_print_1Dview(out_adj); - KokkosKernels::Impl::kk_print_1Dview(out_vals); - - graph_t transpose_graph(new_entries, new_rowmap); - crstmat_t transpose_matrix("transpose", a_crsmat.numRows(), new_values, transpose_graph); - a_crsmat = transpose_matrix; - - orm = a_crsmat.graph.row_map; - oentries = a_crsmat.graph.entries; - ovalues = a_crsmat.values; - - prm = orm.data(); - pentries = oentries.data(); - pvals = ovalues.data(); - - numrows = a_crsmat.numRows(); - //numcols = a_crsmat.numCols(); - nnz = ovalues.extent(0); - } + prm = orm.data(); + pentries = oentries.data(); + pvals = ovalues.data(); + numrows = a_crsmat.numRows(); + 
//numcols = a_crsmat.numCols(); + nnz = ovalues.extent(0); + } + if (transpose) { + row_map_view_t new_rowmap ("new_rowmap", a_crsmat.numCols() + 1); + cols_view_t new_entries ("new_rowmap", a_crsmat.nnz()); + values_view_t new_values ("new_rowmap", a_crsmat.nnz()); + + KokkosKernels::Impl::transpose_matrix< + c_row_map_view_t, c_cols_view_t, c_values_view_t, + row_map_view_t, cols_view_t, values_view_t, row_map_view_t, MyExecSpace>( + a_crsmat.numRows(), a_crsmat.numCols(), + a_crsmat.graph.row_map, a_crsmat.graph.entries, a_crsmat.values, + new_rowmap, new_entries, new_values); + + std::cout << 1 << std::endl; + cols_view_t out_adj ("out_adj", new_entries.extent(0)); + values_view_t out_vals("out_vals", new_entries.extent(0)); + std::cout << 2 << std::endl; + KokkosKernels::Impl::kk_sort_graph + (new_rowmap, new_entries, new_values, out_adj, out_vals); + new_entries = out_adj; + new_values = out_vals; + std::cout << 3 << std::endl; + MyExecSpace().fence(); + KokkosKernels::Impl::kk_print_1Dview(out_adj); + KokkosKernels::Impl::kk_print_1Dview(out_vals); + + graph_t transpose_graph(new_entries, new_rowmap); + crstmat_t transpose_matrix("transpose", a_crsmat.numRows(), new_values, transpose_graph); + a_crsmat = transpose_matrix; + + orm = a_crsmat.graph.row_map; + oentries = a_crsmat.graph.entries; + ovalues = a_crsmat.values; + + prm = orm.data(); + pentries = oentries.data(); + pvals = ovalues.data(); + + numrows = a_crsmat.numRows(); + //numcols = a_crsmat.numCols(); + nnz = ovalues.extent(0); + } - KokkosKernels::Impl::write_kokkos_crst_matrix (a_crsmat, out_bin); + KokkosKernels::Impl::write_kokkos_crst_matrix (a_crsmat, out_bin); + } Kokkos::finalize(); From 2d8f38503d459197cf7d5eae064ed21f68152280 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 28 Jan 2021 10:05:57 -0700 Subject: [PATCH 028/126] scripts: Add blas tpl to env --- scripts/cm_test_all_sandia | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/cm_test_all_sandia 
b/scripts/cm_test_all_sandia index bc55f6cd04..95c0d6cbd3 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -665,6 +665,8 @@ elif [ "$MACHINE" = "blake" ]; then BASE_MODULE_LIST="cmake/3.19.3,/" BASE_MODULE_LIST_INTEL="cmake/3.19.3,/compilers/" + GCC72_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.2.20/gcc/7.2.0" + if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) # TODO: Failing toolchains: @@ -678,7 +680,7 @@ elif [ "$MACHINE" = "blake" ]; then # TODO: Failing toolchains: #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL "OpenMP,Pthread" icpc $INTEL_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $GCC72_MODULE_TPL_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" ) else COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" From 88d8647d11b5bfa6fe7e2094ca03d979b003205a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 28 Jan 2021 10:19:01 -0700 Subject: [PATCH 029/126] scripts: Reduce build list size --- scripts/cm_test_all_sandia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 95c0d6cbd3..cb554d75ed 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -673,7 +673,7 @@ elif [ "$MACHINE" = "blake" ]; then #"intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" COMPILERS=("intel/19.1.144 $BASE_MODULE_LIST_INTEL "OpenMP_Serial" icpc $INTEL_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST "Pthread_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) From 
d9ff5cfa0b8fa1a84653c6a4da1503d452f68a32 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 28 Jan 2021 11:09:28 -0700 Subject: [PATCH 030/126] Implement PR feedback --- scripts/cm_test_all_sandia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index cb554d75ed..58c265458e 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -1078,7 +1078,7 @@ setup_env() { if [[ "${SPOT_CHECK_TPLS}" = "True" ]]; then # Some machines will require explicitly setting include dirs and libs - if ([[ "$MACHINE" = white* ]] || [[ "$MACHINE" = weaver* ]]) && [[ "$mod" = openblas* ]]; then + if ([[ "$MACHINE" = white* ]] || [[ "$MACHINE" = weaver* ]] || [[ "$MACHINE" = blake* ]]) && [[ "$mod" = openblas* ]]; then BLAS_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" LAPACK_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" # BLAS_LIBRARIES="openblas" From ec41e5e128ba979e3ed5424f5c92f3c1661ed02e Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 1 Feb 2021 11:12:23 -0700 Subject: [PATCH 031/126] Construct SpGEMM C with correct #cols --- src/sparse/KokkosSparse_spgemm.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/sparse/KokkosSparse_spgemm.hpp b/src/sparse/KokkosSparse_spgemm.hpp index 72b7000401..ef4abfc20b 100644 --- a/src/sparse/KokkosSparse_spgemm.hpp +++ b/src/sparse/KokkosSparse_spgemm.hpp @@ -53,7 +53,6 @@ namespace KokkosSparse { template void spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) { - using graph_type = typename CMatrix::staticcrsgraph_type; using row_map_type = typename CMatrix::row_map_type::non_const_type; using entries_type = typename CMatrix::index_type::non_const_type; using values_type = typename CMatrix::values_type::non_const_type; @@ -77,8 +76,7 @@ void spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, c_nnz_size); } - graph_type graphC(entriesC, row_mapC); - C = 
CMatrix("matrix", graphC); + C = CMatrix("C=AB", A.numRows(), B.numCols(), c_nnz_size, valuesC, row_mapC, entriesC); } template From b3cdd8251e50519a5e17dae22234809e1936ecec Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 1 Feb 2021 20:30:15 -0700 Subject: [PATCH 032/126] Fixed supernodal SpTRSV build with serial+openmp+cuda --- src/sparse/KokkosSparse_sptrsv_handle.hpp | 4 ++-- src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/sparse/KokkosSparse_sptrsv_handle.hpp b/src/sparse/KokkosSparse_sptrsv_handle.hpp index b235743306..7fa8d4e4ff 100644 --- a/src/sparse/KokkosSparse_sptrsv_handle.hpp +++ b/src/sparse/KokkosSparse_sptrsv_handle.hpp @@ -222,11 +222,11 @@ class SPTRSVHandle { using integer_view_t = Kokkos::View; using integer_view_host_t = Kokkos::View; - using workspace_t = typename Kokkos::View; + using workspace_t = typename Kokkos::View>; // using host_crsmat_t = KokkosSparse::CrsMatrix; - using crsmat_t = KokkosSparse::CrsMatrix; + using crsmat_t = KokkosSparse::CrsMatrix, void, size_type>; // using host_graph_t = typename host_crsmat_t::StaticCrsGraphType; diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 271d8b2396..f7f3cc5bf5 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -617,7 +617,7 @@ struct SparseTriSupernodalSpMVFunctor using scalar_t = typename LHSType::non_const_value_type; - using work_view_t = typename Kokkos::View; + using work_view_t = typename Kokkos::View>; int flag; long node_count; @@ -698,7 +698,7 @@ struct LowerTriSupernodalFunctor using scalar_t = typename ValuesType::non_const_value_type; using integer_view_t = Kokkos::View; - using work_view_t = typename Kokkos::View; + using work_view_t = typename Kokkos::View>; using range_type = Kokkos::pair; @@ -875,7 +875,7 @@ struct UpperTriSupernodalFunctor 
using scalar_t = typename ValuesType::non_const_value_type; using integer_view_t = Kokkos::View; - using work_view_t = typename Kokkos::View; + using work_view_t = typename Kokkos::View>; using SupernodeView = typename Kokkos::View; From b14c6da990d15d0d7e7f6668c45fd1676450cf46 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 2 Feb 2021 14:16:06 -0700 Subject: [PATCH 033/126] Make work_view_t typedef consistent (doesn't affect whether supernodal sptrsv build/tests succeed) --- src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index f7f3cc5bf5..1f8164a3b0 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -1028,7 +1028,7 @@ struct UpperTriTranSupernodalFunctor using scalar_t = typename ValuesType::non_const_value_type; using integer_view_t = Kokkos::View; - using work_view_t = typename Kokkos::View; + using work_view_t = typename Kokkos::View>; using range_type = Kokkos::pair; From 28ca2a932fa19806ca67aecd431d4c724a67303c Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 9 Feb 2021 11:36:38 -0700 Subject: [PATCH 034/126] KokkosSparse - implement serial code path for sierra --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 195 +++++++++++++++++---- 1 file changed, 161 insertions(+), 34 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 7b91f95e09..a79a4090ab 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -324,6 +324,70 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, if (A.numRows () <= static_cast (0)) { return; } + #ifdef KOKKOS_ENABLE_SERIAL + if(std::is_same::value) { + /// serial impl + typedef typename AMatrix::non_const_value_type value_type; + 
typedef typename AMatrix::size_type size_type; + + const size_type *__restrict__ row_map_ptr = A.graph.row_map.data(); + const ordinal_type *__restrict__ col_idx_ptr = A.graph.entries.data(); + const value_type *__restrict__ values_ptr = A.values.data(); + + typename YVector::value_type *__restrict__ y_ptr = y.data(); + typename XVector::value_type *__restrict__ x_ptr = x.data(); + + const value_type one(1), zero(0); + const ordinal_type nrow = A.numRows(); + if (alpha == zero) { + if (beta == zero) { + memset(y_ptr, 0, sizeof(typename YVector::value_type)*nrow); + } else if (beta == one) { + /// do nothing + } else { + for (int i=0;i::value) && @@ -418,45 +482,108 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, KokkosBlas::scal (y, beta, y); } - // Assuming that no row contains duplicate entries, NNZPerRow - // cannot be more than the number of columns of the matrix. Thus, - // the appropriate type is ordinal_type. - const ordinal_type NNZPerRow = A.nnz () / A.numRows (); +#if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_OPENMP) + { + int impl_thread_pool_size(0); +#if defined(KOKKOS_ENABLE_SERIAL) + if (std::is_same::value) + impl_thread_pool_size = 1; +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + if (std::is_same::value) + impl_thread_pool_size = Kokkos::OpenMP::impl_thread_pool_size(); +#endif - int vector_length = 1; - bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); - int max_vector_length = 1; + if (impl_thread_pool_size == 1) { + /// serial impl + typedef typename AMatrix::non_const_value_type value_type; + typedef Kokkos::Details::ArithTraits ATV; + const size_type *__restrict__ row_map_ptr = A.graph.row_map.data(); + const ordinal_type *__restrict__ col_idx_ptr = A.graph.entries.data(); + const value_type *__restrict__ values_ptr = A.values.data(); + + typename YVector::value_type *__restrict__ y_ptr = y.data(); + typename XVector::value_type *__restrict__ x_ptr = x.data(); + + const value_type one(1), zero(0); + 
const ordinal_type nrow = A.numRows(); + if (alpha == zero) { + /// do nothing + } else { + for (int i=0;i(); + int max_vector_length = 1; #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) - max_vector_length = 32; + if(std::is_same::value) + max_vector_length = 32; #endif #ifdef KOKKOS_ENABLE_HIP - if(std::is_same::value) - max_vector_length = 64; + if(std::is_same::value) + max_vector_length = 64; #endif - if(use_teams) { - while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) - vector_length*=2; - } - - typedef SPMV_Transpose_Functor OpType; - - typename AMatrix::const_ordinal_type nrow = A.numRows(); - - OpType op (alpha, A, x, y); - - if(use_teams) { - const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const ordinal_type rows_per_team = rows_per_thread * team_size; - op.rows_per_team = rows_per_team; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< execution_space > - ( nteams , team_size , vector_length ) , op ); - } - else { - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< execution_space > - ( 0 , nrow ) , op ); + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) + vector_length*=2; + } + + typedef SPMV_Transpose_Functor OpType; + + typename AMatrix::const_ordinal_type nrow = A.numRows(); + + OpType op (alpha, A, x, y); + + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + 
Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< execution_space > + ( 0 , nrow ) , op ); + } } } From 15bb370a583ae7b3447e74fb0d47058a494d4e04 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 9 Feb 2021 11:47:22 -0700 Subject: [PATCH 035/126] KokkosSparse - remove warning unused variables --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index a79a4090ab..0eceea5b76 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -505,7 +505,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, typename YVector::value_type *__restrict__ y_ptr = y.data(); typename XVector::value_type *__restrict__ x_ptr = x.data(); - const value_type one(1), zero(0); + const value_type zero(0); const ordinal_type nrow = A.numRows(); if (alpha == zero) { /// do nothing From 2c175fe6ee863637f45c123f786f979b82ae8731 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 9 Feb 2021 12:46:07 -0700 Subject: [PATCH 036/126] KokkosSparse - missing return --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 0eceea5b76..2a3f50e2d4 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -493,6 +493,10 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, if (std::is_same::value) impl_thread_pool_size = Kokkos::OpenMP::impl_thread_pool_size(); #endif +#if defined(KOKKOS_ENABLE_THREADS) + if (std::is_same::value) + impl_thread_pool_size = Kokkos::Threads::impl_thread_pool_size(); +#endif if (impl_thread_pool_size == 1) { 
/// serial impl @@ -539,6 +543,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, } } } + return; } } #endif From b99d242c22c4016865ea78f432f686ed091cb321 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 9 Feb 2021 13:04:54 -0700 Subject: [PATCH 037/126] KokkosSparse - non const ordinal --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 2a3f50e2d4..52f9107b26 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -318,24 +318,24 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, typename YVector::const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::execution_space execution_space; if (A.numRows () <= static_cast (0)) { return; } - #ifdef KOKKOS_ENABLE_SERIAL +#if defined(KOKKOS_ENABLE_SERIAL) if(std::is_same::value) { /// serial impl typedef typename AMatrix::non_const_value_type value_type; - typedef typename AMatrix::size_type size_type; + typedef typename AMatrix::non_const_size_type size_type; const size_type *__restrict__ row_map_ptr = A.graph.row_map.data(); const ordinal_type *__restrict__ col_idx_ptr = A.graph.entries.data(); const value_type *__restrict__ values_ptr = A.values.data(); - typename YVector::value_type *__restrict__ y_ptr = y.data(); - typename XVector::value_type *__restrict__ x_ptr = x.data(); + typename YVector::non_const_value_type *__restrict__ y_ptr = y.data(); + typename XVector::const_value_type *__restrict__ x_ptr = x.data(); const value_type one(1), zero(0); const ordinal_type nrow = A.numRows(); @@ -356,7 +356,7 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, { const int jdist =
(jend-jbeg)/4; - typename YVector::value_type tmp1(0), tmp2(0), tmp3(0), tmp4(0); + typename YVector::non_const_value_type tmp1(0), tmp2(0), tmp3(0), tmp4(0); for (int jj=0;jj Date: Tue, 9 Feb 2021 13:44:00 -0700 Subject: [PATCH 038/126] KokkosSparse - alpha is not accounted --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 52f9107b26..6e21eab477 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -324,7 +324,7 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, if (A.numRows () <= static_cast (0)) { return; } -#if defined(KOKKOS_ENABLE_SERIAL) +#if defined(KOKKOS_ENABLE_SERIAL) if(std::is_same::value) { /// serial impl typedef typename AMatrix::non_const_value_type value_type; @@ -381,7 +381,7 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, const int col_idx = col_idx_ptr[j]; tmp1 += value*x_ptr[col_idx]; } - y_ptr[i] = y_ptr[i]*beta + tmp1 + tmp2 + tmp3 + tmp4; + y_ptr[i] = y_ptr[i]*beta + alpha*(tmp1 + tmp2 + tmp3 + tmp4); } } } @@ -482,7 +482,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, KokkosBlas::scal (y, beta, y); } -#if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_OPENMP) +#if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_THREADS) { int impl_thread_pool_size(0); #if defined(KOKKOS_ENABLE_SERIAL) From 695e5bc06411f61515bba51aa210546115b7694e Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 9 Feb 2021 14:51:21 -0700 Subject: [PATCH 039/126] KokkosSparse - coefficient alpha beta should be declared as yVector type --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp 
b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 6e21eab477..3aec452958 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -337,7 +337,7 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, typename YVector::non_const_value_type *__restrict__ y_ptr = y.data(); typename XVector::const_value_type *__restrict__ x_ptr = x.data(); - const value_type one(1), zero(0); + const typename YVector::non_const_value_type one(1), zero(0); const ordinal_type nrow = A.numRows(); if (alpha == zero) { if (beta == zero) { @@ -509,7 +509,7 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, typename YVector::value_type *__restrict__ y_ptr = y.data(); typename XVector::value_type *__restrict__ x_ptr = x.data(); - const value_type zero(0); + const typename YVector::non_const_value_type zero(0); const ordinal_type nrow = A.numRows(); if (alpha == zero) { /// do nothing From 28cd041d5723ab3089c8f92e6747c05b3e522c66 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Tue, 9 Feb 2021 16:35:02 -0700 Subject: [PATCH 040/126] KokkosSparse - somehow I need to give a special code path for dobeta --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 25 +++++++++++++++------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 3aec452958..8ae6bd9649 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -340,14 +340,14 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, const typename YVector::non_const_value_type one(1), zero(0); const ordinal_type nrow = A.numRows(); if (alpha == zero) { - if (beta == zero) { - memset(y_ptr, 0, sizeof(typename YVector::value_type)*nrow); - } else if (beta == one) { - /// do nothing + if (dobeta == 0) { + memset(y_ptr, 0, sizeof(typename YVector::value_type)*nrow); + } else if (dobeta 
== 1) { + /// so nothing } else { - for (int i=0;i Date: Tue, 9 Feb 2021 17:13:38 -0700 Subject: [PATCH 041/126] KokkosSparse - spot check pass --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 8ae6bd9649..c17a35f40d 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -337,7 +337,7 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, typename YVector::non_const_value_type *__restrict__ y_ptr = y.data(); typename XVector::const_value_type *__restrict__ x_ptr = x.data(); - const typename YVector::non_const_value_type one(1), zero(0); + const typename YVector::non_const_value_type zero(0); const ordinal_type nrow = A.numRows(); if (alpha == zero) { if (dobeta == 0) { From 65473a9b44edd3cd3b38e33bc7d09ae298875e1e Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 10 Feb 2021 23:01:06 -0800 Subject: [PATCH 042/126] Fix view types across ternary operator Instead of using auto, use StridedLayout that is really consistent between the two possible result types. 
--- test_common/KokkosKernels_TestUtils.hpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index bf86768d16..20a568bbc1 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -121,6 +121,8 @@ namespace Test { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; + typedef Kokkos::View SubviewTypeA; + typedef Kokkos::View SubviewTypeB; typedef Kokkos::Details::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; @@ -130,11 +132,19 @@ namespace Test { void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { Kokkos::parallel_for(Kokkos::TeamThreadRange(team,C_rows), [&] (const int& i) { // Give each kokkos thread a vector of A - auto a_vec = A_t ? Kokkos::subview(A, Kokkos::ALL(), i) : Kokkos::subview(A, i, Kokkos::ALL()); + SubviewTypeA a_vec; + if(A_t) + a_vec = Kokkos::subview(A, Kokkos::ALL(), i); + else + a_vec = Kokkos::subview(A, i, Kokkos::ALL()); // Have all vector lanes perform the dot product Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,C_cols), [&] (const int& j) { - auto b_vec = B_t ? Kokkos::subview(B, j, Kokkos::ALL()) : Kokkos::subview(B, Kokkos::ALL(), j); + SubviewTypeB b_vec; + if(B_t) + b_vec = Kokkos::subview(B, j, Kokkos::ALL()); + else + b_vec = Kokkos::subview(B, Kokkos::ALL(), j); ScalarC ab = ScalarC(0); for (int k = 0; k < A_cols; k++) { auto a = A_c ? APT::conj(a_vec(k)) : a_vec(k); From ac242fb802f64b702fa690e1fed640046b274326 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 10 Feb 2021 21:25:58 -0800 Subject: [PATCH 043/126] Add MKL SpMV wrapper To avoid needing specializations for both LayoutLeft/LayoutRight (no difference for rank-1 X/Y), change GetUnifiedLayout to always give LayoutLeft for contiguous rank-1 views. 
--- src/impl/KokkosKernels_helpers.hpp | 2 +- .../tpls/KokkosSparse_spmv_tpl_spec_avail.hpp | 25 +++ .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 149 +++++++++++++++++- src/sparse/KokkosSparse_spmv.hpp | 15 +- 4 files changed, 187 insertions(+), 4 deletions(-) diff --git a/src/impl/KokkosKernels_helpers.hpp b/src/impl/KokkosKernels_helpers.hpp index 28fd6fc749..797435c51e 100644 --- a/src/impl/KokkosKernels_helpers.hpp +++ b/src/impl/KokkosKernels_helpers.hpp @@ -54,7 +54,7 @@ template struct GetUnifiedLayout { typedef typename std::conditional< ( (ViewType::rank == 1) && - (std::is_same::value) ) || + (!std::is_same::value) ) || ( (ViewType::rank == 0) ) ,Kokkos::LayoutLeft,typename ViewType::array_layout>::type array_layout; }; diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index 345b0b013c..a6749be8c8 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -116,6 +116,31 @@ struct spmv_tpl_spec_avail= 9.0? 
#endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ +template <> \ +struct spmv_tpl_spec_avail, Kokkos::MemoryTraits, const int, \ + const SCALAR*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + SCALAR*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits > { \ + enum : bool { value = true }; \ +}; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(float, Kokkos::Serial) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(double, Kokkos::Serial) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::Serial) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(float, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(double, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) +#endif + +#endif + // Specialization struct which defines whether a specialization exists template + +namespace KokkosSparse +{ +namespace Impl +{ + + inline void mkl_safe_call(int errcode) + { + if(errcode != SPARSE_STATUS_SUCCESS) + throw std::runtime_error("MKL returned non-success error code"); + } + + inline sparse_operation_t mode_kk_to_mkl(char mode_kk) + { + switch(toupper(mode_kk)) + { + case 'N': + return SPARSE_OPERATION_NON_TRANSPOSE; + case 'T': + return SPARSE_OPERATION_TRANSPOSE; + case 'H': + return SPARSE_OPERATION_CONJUGATE_TRANSPOSE; + default:; + } + throw std::invalid_argument("Invalid mode for MKL (should be one of N, T, H)"); + } + + inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, + int m, int n, const int* Arowptrs, const int* Aentries, const float* Avalues, + const float* x, float* y) + { + sparse_matrix_t A_mkl; + matrix_descr A_descr; + A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; + A_descr.mode = SPARSE_FILL_MODE_FULL; + 
A_descr.diag = SPARSE_DIAG_NON_UNIT; + mkl_safe_call(mkl_sparse_s_create_csr(&A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); + mkl_safe_call(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + } + + inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, + int m, int n, const int* Arowptrs, const int* Aentries, const double* Avalues, + const double* x, double* y) + { + sparse_matrix_t A_mkl; + matrix_descr A_descr; + A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; + A_descr.mode = SPARSE_FILL_MODE_FULL; + A_descr.diag = SPARSE_DIAG_NON_UNIT; + mkl_safe_call(mkl_sparse_d_create_csr(&A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); + mkl_safe_call(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + } + + inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, Kokkos::complex beta, + int m, int n, const int* Arowptrs, const int* Aentries, const Kokkos::complex* Avalues, + const Kokkos::complex* x, Kokkos::complex* y) + { + sparse_matrix_t A_mkl; + matrix_descr A_descr; + A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; + A_descr.mode = SPARSE_FILL_MODE_FULL; + A_descr.diag = SPARSE_DIAG_NON_UNIT; + mkl_safe_call(mkl_sparse_c_create_csr(&A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex8*) Avalues)); + MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); + MKL_Complex8& beta_mkl = reinterpret_cast(beta); + mkl_safe_call(mkl_sparse_c_mv( + op, alpha_mkl, A_mkl, A_descr, + reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); + } + + inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, Kokkos::complex beta, + int m, int n, const int* Arowptrs, const int* Aentries, const Kokkos::complex* Avalues, + const Kokkos::complex* x, Kokkos::complex* y) + { + sparse_matrix_t A_mkl; + matrix_descr 
A_descr; + A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; + A_descr.mode = SPARSE_FILL_MODE_FULL; + A_descr.diag = SPARSE_DIAG_NON_UNIT; + mkl_safe_call(mkl_sparse_z_create_csr(&A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex16*) Avalues)); + MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); + MKL_Complex16& beta_mkl = reinterpret_cast(beta); + mkl_safe_call(mkl_sparse_z_mv( + op, alpha_mkl, A_mkl, A_descr, + reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); + } + +#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ + template<> \ + struct SPMV, Kokkos::MemoryTraits, int const, \ + SCALAR const*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + SCALAR*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + true, COMPILE_LIBRARY> { \ + \ + using device_type = Kokkos::Device; \ + using AMatrix = CrsMatrix, int const>; \ + using XVector = Kokkos::View>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + static void spmv (const Controls&, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, \ + const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::cout << "Hello from MKL SpMV wrapper for " << Kokkos::ArithTraits::name() << '\n'; \ + std::string label = "KokkosSparse::spmv[TPL_MKL," + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), A.numCols(), \ + A.graph.row_map.data(), A.graph.entries.data(), A.values.data(), x.data(), y.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL + KOKKOSSPARSE_SPMV_MKL(float, Kokkos::Serial, true) + KOKKOSSPARSE_SPMV_MKL(double, Kokkos::Serial, true) + KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, true) + 
KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, true) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP + KOKKOSSPARSE_SPMV_MKL(float, Kokkos::OpenMP, true) + KOKKOSSPARSE_SPMV_MKL(double, Kokkos::OpenMP, true) + KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, true) + KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, true) +#endif + +#undef KOKKOSSPARSE_SPMV_MKL +} +} +#endif + #endif // KOKKOSPARSE_SPMV_TPL_SPEC_DECL_HPP_ diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index ca83cb217b..aca370e476 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -157,12 +157,23 @@ spmv (KokkosKernels::Experimental::Controls controls, #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE //cuSPARSE does not support the conjugate mode (C), and cuSPARSE 9 only supports the normal (N) mode. + if(std::is_same::value || + std::is_same::value) + { #if (9000 <= CUDA_VERSION) - useFallback = useFallback || (mode[0] != NoTranspose[0]); + useFallback = useFallback || (mode[0] != NoTranspose[0]); #endif #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) - useFallback = useFallback || (mode[0] == Conjugate[0]); + useFallback = useFallback || (mode[0] == Conjugate[0]); +#endif + } #endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if(std::is_same::value) + { + useFallback = useFallback || (mode[0] == Conjugate[0]); + } #endif if(useFallback) From 5b50e7cd704f63086d54ef9c624613e5c9465261 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 11 Feb 2021 10:24:27 -0800 Subject: [PATCH 044/126] Removed debug printout --- src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 2834111c18..f3b8090d3f 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -414,7 +414,6 @@ namespace Impl const XVector& x, \ const 
coefficient_type& beta, \ const YVector& y) { \ - std::cout << "Hello from MKL SpMV wrapper for " << Kokkos::ArithTraits::name() << '\n'; \ std::string label = "KokkosSparse::spmv[TPL_MKL," + Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ spmv_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), A.numCols(), \ From 04af6c78671bcfd4441a2e3fd868fb8d1d9a6dd3 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 11 Feb 2021 16:31:30 -0800 Subject: [PATCH 045/126] mkl spmv support for version 17 uses an older interface than 18+ --- .../tpls/KokkosSparse_spmv_tpl_spec_decl.hpp | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index f3b8090d3f..cfebc243dc 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -304,6 +304,9 @@ namespace KokkosSparse namespace Impl { +#if (__INTEL_MKL__ > 2017) + //MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() + inline void mkl_safe_call(int errcode) { if(errcode != SPARSE_STATUS_SUCCESS) @@ -421,6 +424,86 @@ namespace Impl Kokkos::Profiling::popRegion(); \ } \ }; +#endif + +#if (__INTEL_MKL__ == 2017) + //MKL 2017: use old interface: mkl_?csrmv + inline char mode_kk_to_mkl(char mode_kk) + { + switch(toupper(mode_kk)) + { + case 'N': + return 'N'; + case 'T': + return 'T'; + case 'H': + return 'C'; + default:; + } + throw std::invalid_argument("Invalid mode for MKL (should be one of N, T, H)"); + } + + + //void mkl_scsrmv(const char *transa, const MKL_INT *m, const MKL_INT *k, const float *alpha, const char *matdescra, const float *val, const MKL_INT *indx, const MKL_INT *pn trb, const MKL_INT *pntre, const float *x, const float *beta, float *y); + inline void spmv_mkl(char mode, float alpha, float beta, int m, int n, const int* Arowptrs, const int* Aentries, const float* Avalues, const 
float* x, float* y) + { + mkl_scsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); + } + + inline void spmv_mkl(char mode, double alpha, double beta, int m, int n, const int* Arowptrs, const int* Aentries, const double* Avalues, const double* x, double* y) + { + mkl_dcsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); + } + + inline void spmv_mkl(char mode, Kokkos::complex alpha, Kokkos::complex beta, int m, int n, const int* Arowptrs, const int* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) + { + const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); + const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); + const MKL_Complex8* Avalues_mkl = reinterpret_cast(Avalues); + const MKL_Complex8* x_mkl = reinterpret_cast(x); + MKL_Complex8* y_mkl = reinterpret_cast(y); + mkl_ccsrmv(&mode, &m, &n, alpha_mkl, "G**C", Avalues_mkl, Aentries, Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); + } + + inline void spmv_mkl(char mode, Kokkos::complex alpha, Kokkos::complex beta, int m, int n, const int* Arowptrs, const int* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) + { + const MKL_Complex16* alpha_mkl = reinterpret_cast(&alpha); + const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); + const MKL_Complex16* Avalues_mkl = reinterpret_cast(Avalues); + const MKL_Complex16* x_mkl = reinterpret_cast(x); + MKL_Complex16* y_mkl = reinterpret_cast(y); + mkl_zcsrmv(&mode, &m, &n, alpha_mkl, "G**C", Avalues_mkl, Aentries, Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); + } + +#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ + template<> \ + struct SPMV, Kokkos::MemoryTraits, int const, \ + SCALAR const*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + SCALAR*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + true, COMPILE_LIBRARY> { \ + \ + using device_type = 
Kokkos::Device; \ + using AMatrix = CrsMatrix, int const>; \ + using XVector = Kokkos::View>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + static void spmv (const Controls&, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, \ + const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_MKL," + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), A.numCols(), \ + A.graph.row_map.data(), A.graph.entries.data(), A.values.data(), x.data(), y.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; +#endif #ifdef KOKKOS_ENABLE_SERIAL KOKKOSSPARSE_SPMV_MKL(float, Kokkos::Serial, true) From 92396a43e47e39db36bc33e3806209f38a6eb829 Mon Sep 17 00:00:00 2001 From: Kyungjoo Kim Date: Wed, 17 Feb 2021 13:30:08 -0800 Subject: [PATCH 046/126] KokkosSparse - fix for nightly test failure --- src/sparse/impl/KokkosSparse_spmv_impl.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index c17a35f40d..f06e2fb9d9 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -341,7 +341,10 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, const ordinal_type nrow = A.numRows(); if (alpha == zero) { if (dobeta == 0) { - memset(y_ptr, 0, sizeof(typename YVector::value_type)*nrow); + /// not working with kkosDev2_CUDA110_GCC92_cpp17/ + ///memset(y_ptr, 0, sizeof(typename YVector::value_type)*nrow); + for (int i=0;i Date: Thu, 18 Feb 2021 13:48:56 -0700 Subject: [PATCH 047/126] keep track of nonzero entries in generate_supernodal_graph, also add a few more timers --- src/sparse/KokkosSparse_sptrsv.hpp 
| 13 ++- src/sparse/KokkosSparse_sptrsv_superlu.hpp | 5 ++ src/sparse/KokkosSparse_sptrsv_supernode.hpp | 59 ++++++++++++- .../impl/KokkosSparse_sptrsv_solve_impl.hpp | 23 ++++- .../KokkosSparse_sptrsv_symbolic_impl.hpp | 84 ++++++++++++------- 5 files changed, 149 insertions(+), 35 deletions(-) diff --git a/src/sparse/KokkosSparse_sptrsv.hpp b/src/sparse/KokkosSparse_sptrsv.hpp index aafd365590..2ac041201e 100644 --- a/src/sparse/KokkosSparse_sptrsv.hpp +++ b/src/sparse/KokkosSparse_sptrsv.hpp @@ -108,11 +108,17 @@ namespace Experimental { Kokkos::MemoryTraits > Entries_Internal; + #ifdef KK_TRISOLVE_TIMERS + Kokkos::Timer timer_sptrsv; + #endif RowMap_Internal rowmap_i = rowmap; Entries_Internal entries_i = entries; KokkosSparse::Impl::SPTRSV_SYMBOLIC::sptrsv_symbolic (&tmp_handle, rowmap_i, entries_i); + #ifdef KK_TRISOLVE_TIMERS + std::cout << " > sptrsv_symbolic time = " << timer_sptrsv.seconds() << std::endl; + #endif } // sptrsv_symbolic template > Values_Internal; + #ifdef KK_TRISOLVE_TIMERS + Kokkos::Timer timer_sptrsv; + #endif auto sptrsv_handle = handle->get_sptrsv_handle(); if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { RowMap_Internal rowmap_i = rowmap; @@ -189,7 +198,9 @@ namespace Experimental { else { KokkosSparse::Experimental::sptrsv_symbolic (handle, rowmap, entries); } - + #ifdef KK_TRISOLVE_TIMERS + std::cout << " + sptrsv_symbolic time = " << timer_sptrsv.seconds() << std::endl; + #endif } // sptrsv_symbolic template host_graph_t generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const input_size_type *nb) { + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + double time_seconds = 0.0; + Kokkos::Timer timer; + #endif using size_type = typename graph_t::size_type; using cols_view_host_t = typename host_graph_t::entries_type::non_const_type; @@ -476,13 +480,19 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu // count non-empty supernodal blocks 
row_map_view_host_t hr ("rowmap_view", nsuper+1); integer_view_host_t check ("check", nsuper); + integer_view_host_t idxs ("idxs", nsuper); Kokkos::deep_copy (hr, 0); Kokkos::deep_copy (check, 0); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + timer.reset (); + #endif int nblocks = 0; for (int s = 0; s < nsuper; s++) { int j1 = nb[s]; int j2 = j1+1; // based on the first row + + size_type nidxs = 0; for (size_type i = row_map_host (j1); i < row_map_host (j2); i++) { int s2 = map (entries_host (i)); // supernodal blocks may not be filled with zeros @@ -493,10 +503,16 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu nblocks ++; // count blocks per row for col_major hr (s2+1) ++; + // keep track of non-zero block ids + idxs (nidxs) = s2; + nidxs ++; } } // reset check - Kokkos::deep_copy (check, 0); + //Kokkos::deep_copy (check, 0); + for (size_type i = 0; i < nidxs; i++) { + check (idxs(i)) = 0; + } } cols_view_host_t hc ("colmap_view", nblocks); @@ -506,11 +522,18 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu hr (s+1) += hr (s); } } + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = timer.seconds (); + std::cout << " > Generate Supernodal Graph: count blocks : " << time_seconds << std::endl; + timer.reset (); + #endif nblocks = 0; for (int s = 0; s < nsuper; s++) { int j1 = nb[s]; int j2 = j1+1; // based on the first row + + size_type nidxs = 0; for (size_type i = row_map_host (j1); i < row_map_host (j2); i++) { int s2 = map (entries_host (i)); // supernodal blocks may not be filled with zeros @@ -525,19 +548,25 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu hc (nblocks) = s2; } nblocks ++; + // keep track of non-zero block ids + idxs (nidxs) = s2; + nidxs ++; } } if (!col_major) { hr (s+1) = nblocks; } // reset check - if (!col_major) { + /*if (!col_major) { for (size_type s2 = hr(s); s2 < hr(s+1); s2++) { check (hc(s2)) = 0; } } else { // NOTE: nonzero 
supernodes in s-th col are not stored Kokkos::deep_copy (check, 0); + }*/ + for (size_type i = 0; i < nidxs; i++) { + check (idxs(i)) = 0; } } // fix hr @@ -547,10 +576,21 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu } hr (0) = 0; } + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = timer.seconds (); + std::cout << " > Generate Supernodal Graph: compress graph : " << time_seconds + << " (col_major = " << col_major << ")" << std::endl; + timer.reset (); + #endif + // sort column ids per row for (int s = 0; s < nsuper; s++) { std::sort(&(hc (hr (s))), &(hc (hr (s+1)))); } + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = timer.seconds (); + std::cout << " > Generate Supernodal Graph: sort graph : " << time_seconds << std::endl << std::endl; + #endif host_graph_t static_graph (hc, hr); return static_graph; @@ -1018,17 +1058,32 @@ void sptrsv_supernodal_symbolic( // save the supernodal info in the handles for L/U solves handleL->set_supernodes (nsuper, supercols_view, etree); handleU->set_supernodes (nsuper, supercols_view, etree); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = tic.seconds (); + std::cout << " Deep-copy graph Time: " << time_seconds << std::endl; + tic.reset (); + #endif if (handleL->get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_DAG || handleL->get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { // generate supernodal graphs for DAG scheduling auto supL = generate_supernodal_graph (!col_majorL, graphL_host, nsuper, supercols); auto supU = generate_supernodal_graph ( col_majorU, graphU_host, nsuper, supercols); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = tic.seconds (); + std::cout << " Compute Supernodal Graph Time: " << time_seconds << std::endl; + tic.reset (); + #endif auto dagL = generate_supernodal_dag (nsuper, supL, supU); auto dagU = generate_supernodal_dag (nsuper, supU, supL); handleL->set_supernodal_dag (dagL); handleU->set_supernodal_dag (dagU); + #ifdef 
KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = tic.seconds (); + std::cout << " Compute DAG Time: " << time_seconds << std::endl; + tic.reset (); + #endif } // =================================================================== diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 1f8164a3b0..0332b82e49 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2656,6 +2656,10 @@ cudaProfilerStop(); size_type node_count = 0; + #ifdef profile_supernodal_etree + Kokkos::Timer sptrsv_timer; + sptrsv_timer.reset(); + #endif for ( size_type lvl = 0; lvl < nlevels; ++lvl ) { { size_type lvl_nodes = hnodes_per_level(lvl); @@ -2716,7 +2720,6 @@ cudaProfilerStart(); thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_ETREE || thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_DAG) { - //#define profile_supernodal_etree #ifdef profile_supernodal_etree size_t flops = 0; Kokkos::Timer timer; @@ -2884,6 +2887,13 @@ cudaProfilerStop(); } // scope for if-block } // end for lvl + #ifdef profile_supernodal_etree + Kokkos::fence(); + double sptrsv_time_seconds = sptrsv_timer.seconds (); + std::cout << " + Execution space : " << execution_space::name () << std::endl; + std::cout << " + Memory space : " << memory_space::name () << std::endl; + std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl << std::endl; + #endif } // end lower_tri_solve @@ -2954,6 +2964,10 @@ cudaProfilerStop(); size_type node_count = 0; // This must stay serial; would be nice to try out Cuda's graph stuff to reduce kernel launch overhead + #ifdef profile_supernodal_etree + Kokkos::Timer sptrsv_timer; + sptrsv_timer.reset(); + #endif for ( size_type lvl = 0; lvl < nlevels; ++lvl ) { size_type lvl_nodes = hnodes_per_level(lvl); @@ -3279,6 +3293,13 @@ cudaProfilerStop(); #endif } // end if } // end for lvl + #ifdef profile_supernodal_etree + 
Kokkos::fence(); + double sptrsv_time_seconds = sptrsv_timer.seconds (); + std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl << std::endl; + std::cout <<" + Execution space : " << execution_space::name () << std::endl; + std::cout << " + Memory space : " << memory_space::name () << std::endl; + #endif } // end upper_tri_solve diff --git a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 19694063f0..45ebfe9e00 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -167,6 +167,7 @@ template < class TriSolveHandle, class RowMapType, class EntriesType > void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, const EntriesType dentries) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_lowertri_total; + Kokkos::Timer timer; #endif using namespace KokkosSparse::Experimental; @@ -397,6 +398,23 @@ void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, con work_offset_host (s) = 0; } } else { + //#define profile_supernodal_etree + #ifdef profile_supernodal_etree + // min, max, tot size of supernodes + signed_integral_t max_nsrow = 0; + signed_integral_t min_nsrow = 0; + signed_integral_t tot_nsrow = 0; + + signed_integral_t max_nscol = 0; + signed_integral_t min_nscol = 0; + signed_integral_t tot_nscol = 0; + + // min, max, tot num of leaves + signed_integral_t max_nleave = 0; + signed_integral_t min_nleave = 0; + signed_integral_t tot_nleave = 0; + #endif + /* initialize the ready tasks with leaves */ const int *parents = thandle.get_etree_parents (); integer_view_host_t check ("check", nsuper); @@ -421,22 +439,6 @@ void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, con size_type num_done = 0; size_type level = 0; - //#define profile_supernodal_etree - #ifdef profile_supernodal_etree - // min, max, tot size of supernodes - 
signed_integral_t max_nsrow = 0; - signed_integral_t min_nsrow = 0; - signed_integral_t tot_nsrow = 0; - - signed_integral_t max_nscol = 0; - signed_integral_t min_nscol = 0; - signed_integral_t tot_nscol = 0; - - // min, max, tot num of leaves - signed_integral_t max_nleave = 0; - signed_integral_t min_nleave = 0; - signed_integral_t tot_nleave = 0; - #endif while (num_done < nsuper) { nodes_per_level (level) = 0; // look for ready-tasks @@ -564,9 +566,15 @@ void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, con std::cout << " * numer of leaves: min = " << min_nleave << "\t max = " << max_nleave << "\t avg = " << tot_nleave/level << std::endl; std::cout << " * level = " << level << std::endl; #endif + #ifdef TRISOLVE_SYMB_TIMERS + std::cout << " + scheduling time = " << timer.seconds() << std::endl; + #endif // Set number of level equal to be the number of supernodal columns thandle.set_num_levels (level); } + #ifdef TRISOLVE_SYMB_TIMERS + timer.reset(); + #endif // workspace size if (thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { @@ -590,6 +598,10 @@ void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, con Kokkos::deep_copy (dnodes_per_level, nodes_per_level); Kokkos::deep_copy (dlevel_list, level_list); + #ifdef TRISOLVE_SYMB_TIMERS + std::cout << " + workspace time = " << timer.seconds() << std::endl; + #endif + thandle.set_symbolic_complete(); } #endif @@ -604,6 +616,7 @@ template < class TriSolveHandle, class RowMapType, class EntriesType > void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, const EntriesType dentries ) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_uppertri_total; + Kokkos::Timer timer; #endif using namespace KokkosSparse::Experimental; @@ -826,6 +839,21 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co else { /* schduling from bottom to top 
(as for L-solve) * * then reverse it for U-solve */ + #ifdef profile_supernodal_etree + // min, max, tot size of supernodes + signed_integral_t max_nsrow = 0; + signed_integral_t min_nsrow = 0; + signed_integral_t tot_nsrow = 0; + + signed_integral_t max_nscol = 0; + signed_integral_t min_nscol = 0; + signed_integral_t tot_nscol = 0; + + // min, max, tot num of leaves + signed_integral_t max_nleave = 0; + signed_integral_t min_nleave = 0; + signed_integral_t tot_nleave = 0; + #endif /* initialize the ready tasks with leaves */ const int *parents = thandle.get_etree_parents (); @@ -860,21 +888,6 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co size_type num_done = 0; size_type level = 0; - #ifdef profile_supernodal_etree - // min, max, tot size of supernodes - signed_integral_t max_nsrow = 0; - signed_integral_t min_nsrow = 0; - signed_integral_t tot_nsrow = 0; - - signed_integral_t max_nscol = 0; - signed_integral_t min_nscol = 0; - signed_integral_t tot_nscol = 0; - - // min, max, tot num of leaves - signed_integral_t max_nleave = 0; - signed_integral_t min_nleave = 0; - signed_integral_t tot_nleave = 0; - #endif while (num_done < nsuper) { nodes_per_level (level) = 0; // look for ready-tasks @@ -1013,10 +1026,16 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co diag_kernel_type_by_level (level) = 3; } } + #ifdef TRISOLVE_SYMB_TIMERS + std::cout << " + scheduling time = " << timer.seconds() << std::endl; + #endif // Set number of levels thandle.set_num_levels (num_level); } + #ifdef TRISOLVE_SYMB_TIMERS + timer.reset(); + #endif // workspace size if (thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { @@ -1039,6 +1058,9 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co Kokkos::deep_copy (dnodes_grouped_by_level, nodes_grouped_by_level); Kokkos::deep_copy (dnodes_per_level, 
nodes_per_level); Kokkos::deep_copy (dlevel_list, level_list); + #ifdef TRISOLVE_SYMB_TIMERS + std::cout << " + workspace time = " << timer.seconds() << std::endl; + #endif thandle.set_symbolic_complete (); } From 2762df8b47ab4621b10b3f91b08b9cb89f7c30ff Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 25 Feb 2021 13:17:44 -0700 Subject: [PATCH 048/126] cmake: Update ArmPL support - If ARMPL is enabled, enable BLAS too - If OpenMP is enabled, add libarmpl_mp to libs --- cmake/KokkosKernels_config.h.in | 5 +++-- cmake/Modules/FindTPLARMPL.cmake | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index c252f15bc0..7cb277baed 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -107,8 +107,9 @@ #cmakedefine KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV -/* if MKL, BLAS is also defined */ -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +/* if MKL or ARMPL, BLAS is also defined */ +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) ||\ + defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) #if !defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) #define KOKKOSKERNELS_ENABLE_TPL_BLAS #endif diff --git a/cmake/Modules/FindTPLARMPL.cmake b/cmake/Modules/FindTPLARMPL.cmake index 4a0f485192..1251197257 100644 --- a/cmake/Modules/FindTPLARMPL.cmake +++ b/cmake/Modules/FindTPLARMPL.cmake @@ -5,11 +5,15 @@ ELSEIF (ARMPL_LIBRARIES) ELSEIF (ARMPL_LIBRARY_DIRS) KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES armpl LIBRARY_PATHS ${ARMPL_LIBRARY_DIRS}) ELSEIF (DEFINED ENV{ARMPL_DIR}) + IF(KOKKOSKERNELS_INST_EXECSPACE_OPENMP) + SET(ARMPL_MP armpl_mp) + ENDIF() SET(ARMPL_ROOT $ENV{ARMPL_DIR}) KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES amath armpl + ${ARMPL_MP} LIBRARY_PATHS ${ARMPL_ROOT}/lib HEADERS From cd39797c840c366a52e4ea89b846dd83e6bf199b Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 25 Feb 2021 16:42:27 -0700 Subject: [PATCH 049/126] cmake: Ensure F77_BLAS_MANGLE is set and 
checked --- cmake/kokkoskernels_features.cmake | 2 +- cmake/kokkoskernels_tpls.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/kokkoskernels_features.cmake b/cmake/kokkoskernels_features.cmake index 2212332b7d..6f4561f664 100644 --- a/cmake/kokkoskernels_features.cmake +++ b/cmake/kokkoskernels_features.cmake @@ -24,7 +24,7 @@ KOKKOSKERNELS_FEATURE_DEPENDS_ON_TPLS( # Fortran Complex BLAS # ================================================================== -IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL) +IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) INCLUDE(CheckHostBlasReturnComplex.cmake) CHECK_HOST_BLAS_RETURN_COMPLEX(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) ENDIF() diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index 3aebad11b5..f5b5eb31b0 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -424,7 +424,7 @@ KOKKOSKERNELS_ADD_TPL_OPTION(LAPACKE OFF "Whether to enable LAPACKE") # Set F77_BLAS_MANGLE macro based on Fortran-C interface (unless already set # by Trilinos or user) IF ("${F77_BLAS_MANGLE}" STREQUAL "") - IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_MAGMA) + IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_MAGMA OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) ENABLE_LANGUAGE(C) ENABLE_LANGUAGE(Fortran) INCLUDE(FortranCInterface) From a3e71ee90ac076eb16f0277fee69394b1ea2787c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 1 Mar 2021 10:44:31 -0700 Subject: [PATCH 050/126] cmake: Move ARMPL option. - Since the user-facing options use -DKokkosKernels_ENABLE, but cmake reads KOKKOSKERNELS_ENABLE, we must ensure that the option for ARMPL is added before the F77_BLAS_MANGLE check so that fake_tribits can copy KokkosKernels_ENABLE_TPL_ARMPL to KOKKOSKERNELS_ENABLE_TPL_ARMPL. 
--- cmake/kokkoskernels_tpls.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index f5b5eb31b0..2bdcda1e81 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -420,6 +420,7 @@ KOKKOSKERNELS_ADD_TPL_OPTION(MKL OFF "Whether to enable MKL") KOKKOSKERNELS_ADD_TPL_OPTION(MAGMA OFF "Whether to enable MAGMA") KOKKOSKERNELS_ADD_TPL_OPTION(CBLAS OFF "Whether to enable CBLAS") KOKKOSKERNELS_ADD_TPL_OPTION(LAPACKE OFF "Whether to enable LAPACKE") +KOKKOSKERNELS_ADD_TPL_OPTION(ARMPL OFF "Whether to enable ARMPL") # Set F77_BLAS_MANGLE macro based on Fortran-C interface (unless already set # by Trilinos or user) @@ -468,7 +469,6 @@ ENDIF() KOKKOSKERNELS_ADD_TPL_OPTION(CHOLMOD OFF "Whether to enable CHOLMOD") KOKKOSKERNELS_ADD_TPL_OPTION(SUPERLU OFF "Whether to enable SUPERLU") KOKKOSKERNELS_ADD_TPL_OPTION(METIS OFF "Whether to enable METIS") -KOKKOSKERNELS_ADD_TPL_OPTION(ARMPL OFF "Whether to enable ARMPL") # We need to do all the import work IF (NOT KOKKOSKERNELS_HAS_TRILINOS) From a9da68958b5cc005a2b1d38d082d0c858968658c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 1 Mar 2021 10:56:22 -0700 Subject: [PATCH 051/126] cmake: Fix ARMPL libraries --- cmake/Modules/FindTPLARMPL.cmake | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/cmake/Modules/FindTPLARMPL.cmake b/cmake/Modules/FindTPLARMPL.cmake index 1251197257..9375dfa2f0 100644 --- a/cmake/Modules/FindTPLARMPL.cmake +++ b/cmake/Modules/FindTPLARMPL.cmake @@ -1,19 +1,24 @@ +# Both the armpl_mp and armpl libraries define the same public symbol names. +# In order to link against the openmp armpl symbols, instruct cmake to link against armpl_mp. +# In order to link against the default armpl symbols, instruct cmake to link against armpl. 
+IF(KOKKOSKERNELS_INST_EXECSPACE_OPENMP) + SET(ARMPL_LIB armpl_mp) +ELSE() + SET(ARMPL_LIB armpl) +ENDIF() + IF (ARMPL_LIBRARY_DIRS AND ARMPL_LIBRARIES) KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES ${ARMPL_LIBRARIES} LIBRARY_PATHS ${ARMPL_LIBRARY_DIRS}) ELSEIF (ARMPL_LIBRARIES) KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES ${ARMPL_LIBRARIES}) ELSEIF (ARMPL_LIBRARY_DIRS) - KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES armpl LIBRARY_PATHS ${ARMPL_LIBRARY_DIRS}) + KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES amath ${ARMPL_LIB} LIBRARY_PATHS ${ARMPL_LIBRARY_DIRS}) ELSEIF (DEFINED ENV{ARMPL_DIR}) - IF(KOKKOSKERNELS_INST_EXECSPACE_OPENMP) - SET(ARMPL_MP armpl_mp) - ENDIF() SET(ARMPL_ROOT $ENV{ARMPL_DIR}) KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES amath - armpl - ${ARMPL_MP} + ${ARMPL_LIB} LIBRARY_PATHS ${ARMPL_ROOT}/lib HEADERS From 28644d2ac7ef634d9b4cffe288e487260d5b6559 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 1 Mar 2021 12:21:15 -0700 Subject: [PATCH 052/126] cmake/Modules: Set BLAS_LIBRARIES in FindTPLARMPL.cmake --- cmake/Modules/FindTPLARMPL.cmake | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cmake/Modules/FindTPLARMPL.cmake b/cmake/Modules/FindTPLARMPL.cmake index 9375dfa2f0..2ff0efdfbb 100644 --- a/cmake/Modules/FindTPLARMPL.cmake +++ b/cmake/Modules/FindTPLARMPL.cmake @@ -34,8 +34,14 @@ ENDIF() TRY_COMPILE(KOKKOSKERNELS_TRY_COMPILE_ARMPL ${KOKKOSKERNELS_TOP_BUILD_DIR}/tpl_tests ${KOKKOSKERNELS_TOP_SOURCE_DIR}/cmake/compile_tests/armpl.cpp - LINK_LIBRARIES -larmpl -lgfortran -lamath -lm + LINK_LIBRARIES -l${ARMPL_LIB} -lgfortran -lamath -lm OUTPUT_VARIABLE KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT) IF(NOT KOKKOSKERNELS_TRY_COMPILE_ARMPL) MESSAGE(FATAL_ERROR "KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT=${KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT}") +ELSE() + # Check with Jeremy. It looks like the defacto standard is to + # have the user set BLAS_LIBRARIES. 
However, we've done the work + # of finding the armpl blas lib here, so let's go ahead and set it + # in BLAS_LIBRARIES. + SET(BLAS_LIBRARIES "${ARMPL_LIB};gfortran;amath;m") ENDIF() From 8afdc60fb6bc439df30197d9132e01d7bb583fa1 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 1 Mar 2021 15:55:49 -0700 Subject: [PATCH 053/126] cmake: Update ARMPL properties - Ensure that libgfortran and libm are pulled in when linking against KokkosKernels::ARMPL. - Update CheckHostBlasReturnComplex to use properties of KokkosKernels::ARMPL. src: Link kokkoskernels against KokkosKernels::ARMPL. --- CheckHostBlasReturnComplex.cmake | 7 ++++++- cmake/Modules/FindTPLARMPL.cmake | 10 +++++----- src/CMakeLists.txt | 1 + 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/CheckHostBlasReturnComplex.cmake b/CheckHostBlasReturnComplex.cmake index 78ae33515b..30063b1cc3 100644 --- a/CheckHostBlasReturnComplex.cmake +++ b/CheckHostBlasReturnComplex.cmake @@ -5,7 +5,12 @@ FUNCTION(CHECK_HOST_BLAS_RETURN_COMPLEX VARNAME) IF (KOKKOSKERNELS_HAS_TRILINOS) SET(CMAKE_REQUIRED_LIBRARIES ${TPL_BLAS_LIBRARIES}) ELSE() - SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) + # For TPLs, just pull out the required libraries from the target properies. + IF (KOKKOSKERNELS_ENABLE_TPL_ARMPL) + GET_TARGET_PROPERTY(CMAKE_REQUIRED_LIBRARIES KokkosKernels::ARMPL INTERFACE_LINK_LIBRARIES) + ELSE() + SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) + ENDIF() ENDIF() SET(SOURCE diff --git a/cmake/Modules/FindTPLARMPL.cmake b/cmake/Modules/FindTPLARMPL.cmake index 2ff0efdfbb..62e1e33ea3 100644 --- a/cmake/Modules/FindTPLARMPL.cmake +++ b/cmake/Modules/FindTPLARMPL.cmake @@ -39,9 +39,9 @@ TRY_COMPILE(KOKKOSKERNELS_TRY_COMPILE_ARMPL IF(NOT KOKKOSKERNELS_TRY_COMPILE_ARMPL) MESSAGE(FATAL_ERROR "KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT=${KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT}") ELSE() - # Check with Jeremy. It looks like the defacto standard is to - # have the user set BLAS_LIBRARIES. 
However, we've done the work - # of finding the armpl blas lib here, so let's go ahead and set it - # in BLAS_LIBRARIES. - SET(BLAS_LIBRARIES "${ARMPL_LIB};gfortran;amath;m") + # KokkosKernels::ARMPL is an alias to the ARMPL target. + # Let's add in the libgfortran and libm dependencies for users here. + GET_TARGET_PROPERTY(ARMPL_INTERFACE_LINK_LIBRARIES KokkosKernels::ARMPL INTERFACE_LINK_LIBRARIES) + SET(ARMPL_INTERFACE_LINK_LIBRARIES "${ARMPL_INTERFACE_LINK_LIBRARIES};-lgfortran;-lm") + SET_TARGET_PROPERTIES(ARMPL PROPERTIES INTERFACE_LINK_LIBRARIES "${ARMPL_INTERFACE_LINK_LIBRARIES}") ENDIF() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 22c17b5247..57b5394107 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -437,4 +437,5 @@ KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC MKL) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUBLAS) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUSPARSE) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC METIS) +KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ARMPL) # Not yet here KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC MAGMA) From b1d26ed2e98f1dd33f5c8584c41ec3a405499b3d Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 9 Feb 2021 14:20:42 -0700 Subject: [PATCH 054/126] perf_test/blas: - Add GFLOP/s output - Add support for separate batch_size option - Update step option to add step size --- perf_test/blas/blas/KokkosBlas_common.hpp | 3 +- perf_test/blas/blas/KokkosBlas_perf_test.cpp | 17 +- .../blas/blas/KokkosBlas_trtri_perf_test.hpp | 129 ++++++++++----- .../blas/blas3/KokkosBlas3_perf_test.cpp | 2 +- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 150 ++++++++++++------ 5 files changed, 210 insertions(+), 91 deletions(-) diff --git a/perf_test/blas/blas/KokkosBlas_common.hpp b/perf_test/blas/blas/KokkosBlas_common.hpp index a6f9c65d8b..54e79647bf 100644 --- a/perf_test/blas/blas/KokkosBlas_common.hpp +++ b/perf_test/blas/blas/KokkosBlas_common.hpp @@ -56,6 +56,7 @@ #define DEFAULT_STEP 3 #define DEFAULT_WARM_UP_N 
100 #define DEFAULT_N 100 +#define DEFAULT_K 10 #define DEFAULT_OUT &std::cout #define DEFAULT_BLAS_ROUTINES "trtri," @@ -117,7 +118,7 @@ static std::string test_e_str[TEST_N]{"BLAS", "BATCHED"}; * @var n: Number of columns. */ struct matrix_dim { - int m, n; + int k, m, n; }; typedef struct matrix_dim matrix_dim_t; diff --git a/perf_test/blas/blas/KokkosBlas_perf_test.cpp b/perf_test/blas/blas/KokkosBlas_perf_test.cpp index 46e89d5abb..803286f266 100644 --- a/perf_test/blas/blas/KokkosBlas_perf_test.cpp +++ b/perf_test/blas/blas/KokkosBlas_perf_test.cpp @@ -57,6 +57,7 @@ static struct option long_options[] = { {"matrix_size_step", required_argument, 0, 's'}, {"warm_up_loop", required_argument, 0, 'w'}, {"iter", required_argument, 0, 'i'}, + {"batch_size", required_argument, 0, 'k'}, {"csv", required_argument, 0, 'c'}, {"routines", required_argument, 0, 'r'}, {"trtri_options", required_argument, 0, 'o'}, @@ -135,6 +136,11 @@ static void __print_help_blas_perf_test() { "(default: %d)\n\n", DEFAULT_N); + printf("\t-k, --batch_size=LEN\n"); + printf("\t\tBatch size. Adds third dimension to matrices A and B.\n"); + printf("\t\t\tThe value of LEN as an integer. 
(default: %d)\n", + DEFAULT_K); + printf("\t-c, --csv=/path/to/file.csv\n"); printf("\t\tCsv output file selection.\n"); printf( @@ -166,12 +172,16 @@ int main(int argc, char **argv) { /* set default options */ options.test = DEFAULT_TEST; options.loop = DEFAULT_LOOP; + options.start.a.k = DEFAULT_K; options.start.a.m = DEFAULT_MATRIX_START; options.start.a.n = DEFAULT_MATRIX_START; + options.stop.a.k = DEFAULT_K; options.stop.a.m = DEFAULT_MATRIX_STOP; options.stop.a.n = DEFAULT_MATRIX_STOP; + options.start.b.k = DEFAULT_K; options.start.b.m = DEFAULT_MATRIX_START; options.start.b.n = DEFAULT_MATRIX_START; + options.stop.b.k = DEFAULT_K; options.stop.b.m = DEFAULT_MATRIX_STOP; options.stop.b.n = DEFAULT_MATRIX_STOP; options.step = DEFAULT_STEP; @@ -182,7 +192,7 @@ int main(int argc, char **argv) { options.blas_args.trtri.trtri_args = DEFAULT_TRTRI_ARGS; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:c:r:", long_options, + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:c:r:k:", long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas_perf_test(); return 0; @@ -255,6 +265,11 @@ int main(int argc, char **argv) { case 's': options.step = atoi(optarg); break; case 'w': options.warm_up_n = atoi(optarg); break; case 'i': options.n = atoi(optarg); break; + case 'k': + options.start.a.k = options.stop.a.k = + options.start.b.k = options.stop.b.k = + atoi(optarg); + break; case 'c': out_file = optarg; options.out_file = std::string(out_file); diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index e6b7b825a7..34c0237871 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -78,6 +78,21 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = { /*************************** Test types and defaults **************************/ #define DEFAULT_TRTRI_ARGS "UU" + /** + * The KokkosBatched::SerialTrtri 
implementation performs trmm and scal on subblocks + * of the A matrix. a_m subblocks are selected. + */ +static inline int trtri_flop_count(int a_m, int a_n) { + int flop_count = 0; + + for (int i = 0; i < a_m; i++) { + flop_count++; // 1 / A[i,j] + flop_count += (i * (i + 1)); // TRMM FLOPS + flop_count += i; // SCAL FLOPS + } + return flop_count; +} + using view_type_3d = Kokkos::View; struct trtri_args { @@ -87,18 +102,25 @@ struct trtri_args { typedef struct trtri_args trtri_args_t; static std::string trtri_csv_header_str = - "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,warm_up_n,iter," - "total_time(s),average_time(s)"; + "algorithm,side-uplo-trans-diag,loop_type,A_dims,warm_up_n,iter," + "total_time(s),average_time(s),GFLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args, double time_in_seconds) { + double flops = trtri_args.A.extent(0) * trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2)); + double gflops = flops / 10e9; + double average_time = time_in_seconds / options.n; + options.out[0] << test_e_str[options.test] << "," << options.blas_args.trtri.trtri_args << "," - << loop_e_str[options.loop] << "," << trtri_args.A.extent(1) + << loop_e_str[options.loop] << "," << trtri_args.A.extent(0) << "x" << trtri_args.A.extent(1) << "x" << trtri_args.A.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," - << time_in_seconds / options.n << std::endl; + << average_time << "," + << gflops << "," + << gflops / average_time + << std::endl; } static void __print_trtri_perf_test_options(options_t options) { @@ -133,19 +155,26 @@ void __do_trtri_serial_blas(options_t options, trtri_args_t trtri_args) { STATUS; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for 
(int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + } + // Fence after each batch operation + Kokkos::fence(); } timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + } + // Fence after each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -164,19 +193,26 @@ void __do_trtri_serial_batched_template(options_t options, Kokkos::Timer timer; using tag = Algo::Trtri::Unblocked; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrtri::invoke(A); + SerialTrtri::invoke(A); + } + // Fence after each batch operation + Kokkos::fence(); } timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrtri::invoke(A); + SerialTrtri::invoke(A); + } + // Fence after each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -241,16 +277,22 @@ void 
__do_trtri_parallel_blas(options_t options, trtri_args_t trtri_args) { STATUS; - Kokkos::parallel_for("parallelBlasWarmUpLoopTrtri", - Kokkos::RangePolicy(0, warm_up_n), - parallel_blas_trtri_functor); - Kokkos::fence(); + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBlasWarmUpLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBlasTimedLoopTrtri", - Kokkos::RangePolicy(0, n), - parallel_blas_trtri_functor); - Kokkos::fence(); + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBlasTimedLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -287,16 +329,23 @@ void __do_trtri_parallel_batched_template(options_t options, STATUS; - Kokkos::parallel_for("parallelBatchedWarmUpLoopTrtri", - Kokkos::RangePolicy(0, warm_up_n), - parallel_batched_trtri_functor); - Kokkos::fence(); + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBatchedTimedLoopTrtri", - Kokkos::RangePolicy(0, n), - parallel_batched_trtri_functor); - Kokkos::fence(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trtri_output_csv_row(options, trtri_args, timer.seconds()); return; @@ -345,7 +394,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { trtri_args.uplo = 
options.blas_args.trtri.trtri_args.c_str()[0]; trtri_args.diag = options.blas_args.trtri.trtri_args.c_str()[1]; - trtri_args.A = vta("trtri_args.A", options.n, dim.a.m, dim.a.n); + trtri_args.A = vta("trtri_args.A", dim.a.k, dim.a.m, dim.a.n); host_A = Kokkos::create_mirror_view(trtri_args.A); Kokkos::fill_random(trtri_args.A, rand_pool, @@ -355,7 +404,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { if (trtri_args.uplo == 'U' || trtri_args.uplo == 'u') { // Make A upper triangular - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 1; i < dim.a.m; i++) { for (int j = 0; j < i; j++) { @@ -367,7 +416,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { // Make A lower triangular // Kokkos::parallel_for("toLowerLoop", options.n, KOKKOS_LAMBDA (const int& // i) { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < dim.a.m - 1; i++) { for (int j = i + 1; j < dim.a.n; j++) { @@ -378,7 +427,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { } if (trtri_args.diag == 'U' || trtri_args.diag == 'u') { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < min_dim; i++) { A(i, i) = scalar_type(1); @@ -408,8 +457,8 @@ void __do_loop_and_invoke(options_t options, for (cur_dims = options.start; cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n; - cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, - cur_dims.b.m *= options.step, cur_dims.b.n *= options.step) { + cur_dims.a.m += options.step, cur_dims.a.n += options.step, + cur_dims.b.m += options.step, cur_dims.b.n += options.step) { 
trtri_args = __do_setup( options, cur_dims); fn(options, trtri_args); diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index b493c244d8..6c95960e25 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -122,7 +122,7 @@ static void __print_help_blas3_perf_test() { printf("\t-k, --batch_size=LEN\n"); printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n"); printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", - DEFAULT_VECTOR_LEN); + DEFAULT_K); printf("\t-l, --loop_type=OPTION\n"); printf("\t\tLoop selection.\n"); diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 70f7664679..79b58dc7d8 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -72,6 +72,26 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { #define DEFAULT_TRMM_ARGS "LUNU" #define DEFAULT_TRMM_ALPHA 1.0 +/** + * The KokkosBatched::SerialTrmm implementation performs dot products on + * non-zero elements of the triangular matrices. The flop calculation below + * assumes KokkosBatched::SerialTrmm is being used. Since the dot products + * do a multiply and add we can calculate the flops for any element in the last + * column of the LHS to be 2*columns_LHS, any element in the last-1 column of + * the LHS to be 2*(columns_LHS-1), and so on. 
We do this for every row of the LHS + * giving us this flop count: + * flops = columns_LHS * (columns_LHS + 1) + * flops = (flops / 2) * 2 + * flops = flops * rows_LHS + */ +static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { + if (side == 'L' || side == 'l') { + return (a_n * (a_n + 1)) * a_m; + } else { + return (b_n * (b_n + 1)) * b_m; + } +} + using view_type_3d = Kokkos::View; struct trmm_args { @@ -83,19 +103,28 @@ typedef struct trmm_args trmm_args_t; static std::string trmm_csv_header_str = "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n," - "iter,total_time(s),average_time(s)"; + "iter,total_time(s),average_time(s),GFLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, double time_in_seconds) { + double flops = trmm_args.A.extent(0) * trmm_flop_count(trmm_args.side, + trmm_args.B.extent(1), trmm_args.B.extent(2), + trmm_args.A.extent(1), trmm_args.A.extent(2)); + double gflops = flops / 10e9; + double average_time = time_in_seconds / options.n; + options.out[0] << test_e_str[options.test] << "," << options.blas_args.trmm.trmm_args << "," << options.blas_args.trmm.alpha << "," - << loop_e_str[options.loop] << "," << trmm_args.A.extent(1) - << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(1) + << loop_e_str[options.loop] << "," << trmm_args.A.extent(0) << "x" << trmm_args.A.extent(1) + << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1) << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," - << time_in_seconds / options.n << std::endl; + << time_in_seconds / options.n << "," + << gflops << "," + << gflops / average_time + << std::endl; } static void __print_trmm_perf_test_options(options_t options) { @@ -131,24 +160,30 @@ void __do_trmm_serial_blas(options_t 
options, trmm_args_t trmm_args) { STATUS; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, - &trmm_args.diag, trmm_args.alpha, A, B); + KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, + &trmm_args.diag, trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } - Kokkos::fence(); timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, - &trmm_args.diag, trmm_args.alpha, A, B); + KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, + &trmm_args.diag, trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -167,21 +202,28 @@ void __do_trmm_serial_batched_template(options_t options, Kokkos::Timer timer; using tag = Algo::Trmm::Unblocked; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j 
= 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrmm::invoke(trmm_args.alpha, A, B); + SerialTrmm::invoke(trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrmm::invoke(trmm_args.alpha, A, B); + SerialTrmm::invoke(trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -316,16 +358,22 @@ void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { STATUS; - Kokkos::parallel_for("parallelBlasWarmUpLoopTrmm", - Kokkos::RangePolicy(0, warm_up_n), - parallel_blas_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < warm_up_n; ++j) { + Kokkos::parallel_for("parallelBlasWarmUpLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBlasTimedLoopTrmm", - Kokkos::RangePolicy(0, n), - parallel_blas_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < n; ++j) { + Kokkos::parallel_for("parallelBlasTimedLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trmm_output_csv_row(options, trmm_args, 
timer.seconds()); #else std::cerr << std::string(__func__) @@ -368,16 +416,22 @@ void __do_trmm_parallel_batched_template(options_t options, STATUS; - Kokkos::parallel_for("parallelBatchedWarmUpLoopTrmm", - Kokkos::RangePolicy(0, warm_up_n), - parallel_batched_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < warm_up_n; ++j) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBatchedTimedLoopTrmm", - Kokkos::RangePolicy(0, n), - parallel_batched_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < n; ++j) { + Kokkos::parallel_for("parallelBatchedTimedLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trmm_output_csv_row(options, trmm_args, timer.seconds()); return; @@ -498,8 +552,8 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { trmm_args.uplo = options.blas_args.trmm.trmm_args.c_str()[1]; trmm_args.trans = options.blas_args.trmm.trmm_args.c_str()[2]; trmm_args.diag = options.blas_args.trmm.trmm_args.c_str()[3]; - trmm_args.A = vta("trmm_args.A", options.n, dim.a.m, dim.a.n); - trmm_args.B = vtb("trmm_args.B", options.n, dim.b.m, dim.b.n); + trmm_args.A = vta("trmm_args.A", dim.a.k, dim.a.m, dim.a.n); + trmm_args.B = vtb("trmm_args.B", dim.b.k, dim.b.m, dim.b.n); trmm_args.alpha = options.blas_args.trmm.alpha; host_A = Kokkos::create_mirror_view(trmm_args.A); @@ -510,7 +564,7 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { if (trmm_args.uplo == 'U' || trmm_args.uplo == 'u') { // Make A upper triangular - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 1; i < dim.a.m; i++) { for (int j = 0; j < i; j++) { @@ 
-522,7 +576,7 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { // Make A lower triangular // Kokkos::parallel_for("toLowerLoop", options.n, KOKKOS_LAMBDA (const int& // i) { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < dim.a.m - 1; i++) { for (int j = i + 1; j < dim.a.n; j++) { @@ -533,7 +587,7 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { } if (trmm_args.diag == 'U' || trmm_args.diag == 'u') { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < min_dim; i++) { A(i, i) = scalar_type(1); @@ -566,8 +620,8 @@ void __do_loop_and_invoke(options_t options, for (cur_dims = options.start; cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n; - cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, - cur_dims.b.m *= options.step, cur_dims.b.n *= options.step) { + cur_dims.a.m += options.step, cur_dims.a.n += options.step, + cur_dims.b.m += options.step, cur_dims.b.n += options.step) { trmm_args = __do_setup( options, cur_dims); From 3211987c766583f587d3ad9bffca32e3e59d3d18 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Feb 2021 08:41:57 -0700 Subject: [PATCH 055/126] perf_test: Account for complex flop counts --- .../blas/blas/KokkosBlas_trtri_perf_test.hpp | 21 ++++++++++++++++--- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 17 +++++++++++++-- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index 34c0237871..3cacc73739 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -84,12 +84,27 @@ void 
(*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = { */ static inline int trtri_flop_count(int a_m, int a_n) { int flop_count = 0; + int flops_per_div, flops_per_mul, flops_per_add; + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) { + flops_per_div = 1; + flops_per_mul = 1; + flops_per_add = 1; + } else { + // For complex, we need to count 2 flops for each add and 6 flops for each multiply or divide. + flops_per_div = 6; + flops_per_mul = 6; + flops_per_add = 2; + } for (int i = 0; i < a_m; i++) { - flop_count++; // 1 / A[i,j] - flop_count += (i * (i + 1)); // TRMM FLOPS - flop_count += i; // SCAL FLOPS + flop_count += flops_per_div; // 1 / A[i,j] + flop_count += ((i * (i + 1)) / 2) * (flops_per_mul + flops_per_add); // TRMM FLOPS + flop_count += i * flops_per_mul; // SCAL FLOPS } + return flop_count; } diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 79b58dc7d8..077c5b3d80 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -85,11 +85,23 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { * flops = flops * rows_LHS */ static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { + int flops; + if (side == 'L' || side == 'l') { - return (a_n * (a_n + 1)) * a_m; + flops = (a_n * (a_n + 1)) * a_m; } else { - return (b_n * (b_n + 1)) * b_m; + flops = (b_n * (b_n + 1)) * b_m; } + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return flops; + + // Account for 6 additional flops when complex numbers are used. + // Above we have counted 1 flop for each add and 1 flop for each multiply. + // For complex, we need to count 2 flops for each add and 6 flops for each multiply. 
+ return flops * 4; } using view_type_3d = @@ -348,6 +360,7 @@ struct parallel_blas_trmm { template void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { +// TODO: Note why this is disabled on CUDA and HIP #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; From 667fee39d51110ee2bcd3b7e1216f0d91eac9685 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Feb 2021 14:08:10 -0700 Subject: [PATCH 056/126] perf_test: Use flop counts from lapack note 41 --- .../blas/blas/KokkosBlas_trtri_perf_test.hpp | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index 3cacc73739..de24a96254 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -82,7 +82,7 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = { * The KokkosBatched::SerialTrtri implementation performs trmm and scal on subblocks * of the A matrix. a_m subblocks are selected. */ -static inline int trtri_flop_count(int a_m, int a_n) { +static inline int trtri_impl_flop_count(int a_m, int a_n) { int flop_count = 0; int flops_per_div, flops_per_mul, flops_per_add; @@ -108,6 +108,34 @@ static inline int trtri_flop_count(int a_m, int a_n) { return flop_count; } +// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline int trtri_flop_count(int a_m, int a_n) { + int flops; + int flops_per_mul; + int flops_per_add; + + if (a_m != a_n) { + fprintf(stderr, "%s:%d:ERROR: a_m != a_n.\n", __FILE__, __LINE__); + exit(255); + } + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) { + flops_per_mul = 1; + flops_per_add = 1; + } else { + // For complex, we need to count 2 flops for each add and 6 flops for each multiply. 
+ flops_per_mul = 6; + flops_per_add = 2; + } + + flops = (1./6.*a_n*a_n*a_n + 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_mul + + (1./6.*a_n*a_n*a_n - 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_add; + + return flops; +} + using view_type_3d = Kokkos::View; struct trtri_args { @@ -118,7 +146,7 @@ typedef struct trtri_args trtri_args_t; static std::string trtri_csv_header_str = "algorithm,side-uplo-trans-diag,loop_type,A_dims,warm_up_n,iter," - "total_time(s),average_time(s),GFLOPS,GFLOP/average_time(s)"; + "total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args, @@ -133,7 +161,7 @@ static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args, << "x" << trtri_args.A.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," << average_time << "," - << gflops << "," + << flops << "," << gflops / average_time << std::endl; } From 973afc564f7ba479731773856b9a93f4ddcda647 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Feb 2021 14:08:18 -0700 Subject: [PATCH 057/126] perf_test: Update flop counts - Use flop counts from lapack note 41 - Fix impl flop counts for side == left --- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 077c5b3d80..a35caad5dd 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -84,11 +84,11 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { * flops = (flops / 2) * 2 * flops = flops * rows_LHS */ -static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { +static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { int 
flops; if (side == 'L' || side == 'l') { - flops = (a_n * (a_n + 1)) * a_m; + flops = (b_m * (b_m + 1)) * b_n; } else { flops = (b_n * (b_n + 1)) * b_m; } @@ -104,6 +104,27 @@ static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) return flops * 4; } +// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { + int flops; + + if (side == 'L' || side == 'l') { + flops = b_m * b_m * b_n; + } else { + flops = b_n * b_n * b_m; + } + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return flops; + + // Account for 6 additional flops when complex numbers are used. + // Above we have counted 1 flop for each add and 1 flop for each multiply. + // For complex, we need to count 2 flops for each add and 6 flops for each multiply. + return flops * 4; +} + using view_type_3d = Kokkos::View; struct trmm_args { @@ -115,7 +136,7 @@ typedef struct trmm_args trmm_args_t; static std::string trmm_csv_header_str = "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n," - "iter,total_time(s),average_time(s),GFLOPS,GFLOP/average_time(s)"; + "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, @@ -134,7 +155,7 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," << time_in_seconds / options.n << "," - << gflops << "," + << flops << "," << gflops / average_time << std::endl; } From 8d2868740c2dd960f7bddaac061d3bd5edfd61a9 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Feb 2021 14:27:36 -0700 Subject: [PATCH 058/126] perf_test: Update gemm to optionally use RangePolicy --- 
.../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 46 ++++++++++++++++++- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index f26fbb7287..b66f4c3bd0 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -58,6 +58,7 @@ #include "KokkosBatched_Util.hpp" //#define GEMM_PERF_TEST_DEBUG +#define KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY // Forward declarations void do_gemm_serial_blas(options_t options); @@ -322,6 +323,24 @@ void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) { return; } +template +struct parallel_batched_gemm_range_policy { + gemm_args_t gemm_args_; + + parallel_batched_gemm_range_policy(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int &i) const { + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } +}; + template struct parallel_batched_gemm { @@ -375,36 +394,59 @@ template ; + using functor_type = + parallel_batched_gemm_range_policy; +#else using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; using functor_type = parallel_batched_gemm; +#endif uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; +#if !defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) auto league_size = options.start.c.k; +#endif Kokkos::Timer timer; STATUS; functor_type parallel_batched_gemm_functor(gemm_args); +#if !defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) auto team_size = gemm_args.bp.team_size; auto vector_len = gemm_args.bp.vector_len; +#endif for (uint32_t i = 0; i < 
warm_up_n; i++) { +#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); +#else Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", policy_type(league_size, team_size, vector_len), parallel_batched_gemm_functor); +#endif + Kokkos::fence(); } - Kokkos::fence(); timer.reset(); for (uint32_t i = 0; i < n; i++) { +#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); +#else Kokkos::parallel_for("parallelBatchedTimedLoopGemm", policy_type(league_size, team_size, vector_len), parallel_batched_gemm_functor); +#endif + Kokkos::fence(); } - Kokkos::fence(); __gemm_output_csv_row(options, gemm_args, timer.seconds()); From ccbbad3546c4cad67ba1419b35ab9b02f95b1043 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Feb 2021 14:39:54 -0700 Subject: [PATCH 059/126] perf_test: Update GEMM to output GFLOPs --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index b66f4c3bd0..9792c3a061 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -128,15 +128,24 @@ typedef struct gemm_args gemm_args_t; static std::string gemm_csv_header_str = "algorithm,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_" "dims,C_dims,warm_up_n," - "iter,total_time(s),average_time(s)"; + "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ +// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline int __gemm_flop_count(int a_m, 
int a_n, int b_k) { + return 2 * a_m * b_k * a_n; +} static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, const char *experiment_name = nullptr) { std::string algo_name = test_e_str[options.test]; if (experiment_name) algo_name = std::string(experiment_name); + double flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2), + gemm_args.B.extent(2)); + double gflops = flops / 10e9; + double average_time = time_in_seconds / options.n; + options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," << options.blas_args.gemm.alpha << "," << options.blas_args.gemm.beta << "," << gemm_args.bp.team_size @@ -147,7 +156,10 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, << "x" << gemm_args.B.extent(2) << "," << gemm_args.C.extent(0) << "x" << gemm_args.C.extent(1) << "x" << gemm_args.C.extent(2) << "," << options.warm_up_n << "," << options.n << "," - << time_in_seconds << "," << time_in_seconds / options.n + << time_in_seconds << "," + << time_in_seconds / options.n << "," + << flops << "," + << gflops / average_time << std::endl; } From 274e9289af0838c1b675eb0ec0bc31d2c506fc62 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Feb 2021 14:44:22 -0700 Subject: [PATCH 060/126] perf_test: Update gemm size step --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 9792c3a061..0f1b7f70b6 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -959,9 +959,9 @@ void __do_loop_and_invoke(options_t options, cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n && cur_dims.c.m <= options.stop.c.m && 
cur_dims.c.n <= options.stop.c.n; - cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, - cur_dims.b.m *= options.step, cur_dims.b.n *= options.step, - cur_dims.c.m *= options.step, cur_dims.c.n *= options.step) { + cur_dims.a.m += options.step, cur_dims.a.n += options.step, + cur_dims.b.m += options.step, cur_dims.b.n += options.step, + cur_dims.c.m += options.step, cur_dims.c.n += options.step) { gemm_args = __do_setup(options, cur_dims); fn(options, gemm_args); From 6f4e05bd022a61c1b2c71b8f822fdb2a4165aab1 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 11 Feb 2021 11:42:41 -0700 Subject: [PATCH 061/126] perf_test: Disable KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 0f1b7f70b6..59f5a84803 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -58,7 +58,7 @@ #include "KokkosBatched_Util.hpp" //#define GEMM_PERF_TEST_DEBUG -#define KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY +//#define KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY // Forward declarations void do_gemm_serial_blas(options_t options); From b066fa9b2a3170e62e33cc686a386749089942b3 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 11 Feb 2021 14:32:06 -0700 Subject: [PATCH 062/126] perf_test/blas: Fix GFLOP calculation --- perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp | 4 ++-- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +- perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index de24a96254..32626cfba5 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ 
b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -129,7 +129,7 @@ static inline int trtri_flop_count(int a_m, int a_n) { flops_per_mul = 6; flops_per_add = 2; } - + flops = (1./6.*a_n*a_n*a_n + 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_mul + (1./6.*a_n*a_n*a_n - 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_add; @@ -152,7 +152,7 @@ static std::string trtri_csv_header_str = static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args, double time_in_seconds) { double flops = trtri_args.A.extent(0) * trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2)); - double gflops = flops / 10e9; + double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; options.out[0] << test_e_str[options.test] << "," diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 59f5a84803..29fcace727 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -143,7 +143,7 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2), gemm_args.B.extent(2)); - double gflops = flops / 10e9; + double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index a35caad5dd..9a7f7cc480 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -144,7 +144,7 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, double flops = trmm_args.A.extent(0) * trmm_flop_count(trmm_args.side, trmm_args.B.extent(1), trmm_args.B.extent(2), trmm_args.A.extent(1), trmm_args.A.extent(2)); - double gflops = flops / 10e9; + 
double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; options.out[0] << test_e_str[options.test] << "," From 63382d3722d2868258cefffd4b5639a475d35198 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 12 Feb 2021 20:38:12 -0700 Subject: [PATCH 063/126] perf_test/blas/blas3: Add bandwidth metric to trmm --- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 9a7f7cc480..a313eabbaf 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -136,7 +136,7 @@ typedef struct trmm_args trmm_args_t; static std::string trmm_csv_header_str = "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n," - "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; + "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s),min_achieved_bandwidth(GB/s),max_achieved_bandwidth(GB/s)"; /*************************** Internal helper fns **************************/ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, @@ -146,6 +146,23 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, trmm_args.A.extent(1), trmm_args.A.extent(2)); double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; + double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * sizeof(default_scalar)) / 1e9; + double min_memory_transactions, max_memory_transactions; + + // Assuming infinite cache size + // We have to read A and B into the cache once and then write + // B back out to main memory once. + min_memory_transactions = 3; + + // Assuming no register or real caching + // We have to go out to memory for every element we read from A and B as well as + // every element we write to B. 
+ // We use the trmm flops from lapack note 41 and multiple by 3/2 to account for the + // write to B since this flop count is for one multiply and one add. + if (trmm_args.side == 'l' || trmm_args.side == 'L') + max_memory_transactions = trmm_args.B.extent(1) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * (3./2.); + else + max_memory_transactions = trmm_args.B.extent(2) * trmm_args.B.extent(2) * trmm_args.B.extent(1) * (3./2.); options.out[0] << test_e_str[options.test] << "," << options.blas_args.trmm.trmm_args << "," @@ -154,9 +171,11 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1) << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," - << time_in_seconds / options.n << "," + << average_time << "," << flops << "," - << gflops / average_time + << gflops / average_time << "," + << (gbytes_in_matrix * min_memory_transactions) / average_time << "," + << (gbytes_in_matrix * max_memory_transactions) / average_time << std::endl; } From 898794eb3860f897a44a03b77211d06d4d74809a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 15 Feb 2021 16:20:06 -0700 Subject: [PATCH 064/126] perf_test: Handle complex numbers in flop count --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 29fcace727..d6572dfd34 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -133,7 +133,13 @@ static std::string gemm_csv_header_str = /*************************** Internal helper fns **************************/ // Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf static inline int __gemm_flop_count(int a_m, 
int a_n, int b_k) { - return 2 * a_m * b_k * a_n; + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return 2 * a_m * b_k * a_n; + else + // For complex, we need to count 2 flops for each add and 6 flops for each multiply. + return (2 + 6) * a_m * b_k * a_n; } static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, From f11f9138e9c69a6387b9e8c67c0809d81be7f872 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 15 Feb 2021 16:35:38 -0700 Subject: [PATCH 065/126] perf_test/blas/blas3: Gemm perf_test_updates - Fix batched_serial to use RangePolicy instead of TeamPolicy - Add --use_auto option and optionally use Kokkos::AUTO for team_size and vector_len in gemm. --- perf_test/blas/blas3/KokkosBlas3_common.hpp | 2 + .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 93 +++++++++++++------ .../blas/blas3/KokkosBlas3_perf_test.cpp | 9 +- 3 files changed, 73 insertions(+), 31 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index 4952a8e606..01e368e15c 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -61,6 +61,7 @@ #define DEFAULT_BLAS_ROUTINES "trmm,gemm," #define DEFAULT_TEAM_SIZE 1 #define DEFAULT_VECTOR_LEN 1 +#define DEFAULT_USE_AUTO 0 /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { @@ -83,6 +84,7 @@ struct blas_args { // ADD MORE BLAS3 ROUTINES HERE int team_size; int vector_len; + bool use_auto; // ADD MORE COMMON BLAS3 OPTIONS HERE }; typedef struct blas_args blas_args_t; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index d6572dfd34..b4d55d0e90 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -58,7 +58,6 @@ #include "KokkosBatched_Util.hpp" //#define 
GEMM_PERF_TEST_DEBUG -//#define KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY // Forward declarations void do_gemm_serial_blas(options_t options); @@ -409,60 +408,32 @@ struct parallel_batched_gemm { template -void __do_gemm_parallel_batched_template(options_t options, - gemm_args_t gemm_args) { +void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_args_t gemm_args) { using execution_space = typename device_type::execution_space; -#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) - printf("Using RangePolicy!\n"); using policy_type = Kokkos::RangePolicy; using functor_type = parallel_batched_gemm_range_policy; -#else - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - using functor_type = - parallel_batched_gemm; -#endif uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; -#if !defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) - auto league_size = options.start.c.k; -#endif Kokkos::Timer timer; STATUS; functor_type parallel_batched_gemm_functor(gemm_args); -#if !defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) - auto team_size = gemm_args.bp.team_size; - auto vector_len = gemm_args.bp.vector_len; -#endif for (uint32_t i = 0; i < warm_up_n; i++) { -#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", policy_type(0, options.start.c.k), parallel_batched_gemm_functor); -#else - Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(league_size, team_size, vector_len), - parallel_batched_gemm_functor); -#endif Kokkos::fence(); } timer.reset(); for (uint32_t i = 0; i < n; i++) { -#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) Kokkos::parallel_for("parallelBatchedTimedLoopGemm", policy_type(0, options.start.c.k), parallel_batched_gemm_functor); -#else - Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(league_size, team_size, vector_len), - 
parallel_batched_gemm_functor); -#endif Kokkos::fence(); } @@ -471,6 +442,68 @@ void __do_gemm_parallel_batched_template(options_t options, return; } +template +void __do_gemm_parallel_batched_template(options_t options, + gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + + if (std::is_same::value) { + return __do_gemm_parallel_batched_template_range_policy(options, gemm_args); + } + + STATUS; + + functor_type parallel_batched_gemm_functor(gemm_args); + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + if (options.blas_args.use_auto) { + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + } else { + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + } + + __gemm_output_csv_row(options, gemm_args, timer.seconds()); + + return; +} + template void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { char a = gemm_args.transA; diff --git 
a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 6c95960e25..0f1f2b5d07 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -119,6 +119,11 @@ static void __print_help_blas3_perf_test() { printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", DEFAULT_VECTOR_LEN); + printf("\t-u, --use_auto={0,1}\n"); + printf("\t\tWhether to use Kokkos::AUTO for vector_len and team_size (Heirarchical parallelism).\n"); + printf("\t\t\t1 to use Kokkos::AUTO, otherwise --vector_len and --team_size will be used. (default: %d)\n", + DEFAULT_USE_AUTO); + printf("\t-k, --batch_size=LEN\n"); printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n"); printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", @@ -238,6 +243,7 @@ int main(int argc, char **argv) { options.blas_routines = std::string(DEFAULT_BLAS_ROUTINES); options.blas_args.team_size = DEFAULT_TEAM_SIZE; options.blas_args.vector_len = DEFAULT_VECTOR_LEN; + options.blas_args.use_auto = DEFAULT_USE_AUTO; options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS; options.blas_args.trmm.alpha = DEFAULT_TRMM_ALPHA; @@ -245,7 +251,7 @@ int main(int argc, char **argv) { options.blas_args.gemm.gemm_args = DEFAULT_GEMM_ARGS; options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:", + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:", long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; @@ -363,6 +369,7 @@ int main(int argc, char **argv) { break; case 'z': options.blas_args.team_size = atoi(optarg); break; case 'n': options.blas_args.vector_len = atoi(optarg); break; + case 'u': options.blas_args.use_auto = atoi(optarg); break; case 'c': out_file = optarg; options.out_file = std::string(out_file); From fb41b4c01582cfab5c88cb39261660030af260c2 Mon Sep 17 
00:00:00 2001 From: Evan Harvey Date: Tue, 16 Feb 2021 10:11:21 -0700 Subject: [PATCH 066/126] perf_test/blas/blas3: - Initialize options.blas_args.gemm.beta. - rename --gemm_alpha to --gemm_scalars and accept beta input arg. --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 3 ++- .../blas/blas3/KokkosBlas3_perf_test.cpp | 22 ++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index b4d55d0e90..06d854bc2a 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -105,6 +105,7 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { /*************************** Test types and defaults **************************/ #define DEFAULT_GEMM_ARGS "NN" #define DEFAULT_GEMM_ALPHA 1.0 +#define DEFAULT_GEMM_BETA 1.0 using view_type_3d = Kokkos::View; @@ -963,7 +964,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { gemm_args.B = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n); gemm_args.C = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n); gemm_args.alpha = options.blas_args.gemm.alpha; - gemm_args.alpha = options.blas_args.gemm.beta; + gemm_args.beta = options.blas_args.gemm.beta; gemm_args.bp.team_size = options.blas_args.team_size; gemm_args.bp.vector_len = options.blas_args.vector_len; diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 0f1f2b5d07..0ec88f42f7 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -63,7 +63,7 @@ static struct option long_options[] = { {"trmm_options", required_argument, 0, 'o'}, {"trmm_alpha", required_argument, 0, 'a'}, {"gemm_options", required_argument, 0, 'g'}, - {"gemm_alpha", required_argument, 0, 'p'}, + {"gemm_scalars", required_argument, 0, 'p'}, {"team_size", required_argument, 0, 'z'}, 
{"vector_len", required_argument, 0, 'n'}, {"batch_size", required_argument, 0, 'k'}, @@ -104,10 +104,10 @@ static void __print_help_blas3_perf_test() { "%s)\n", DEFAULT_GEMM_ARGS); - printf("\t-p, --gemm_alpha=SCALAR_VALUE\n"); - printf("\t\tGEMM alpha value.\n"); - printf("\t\t\tThe value of alpha in floating point. (default: %lf)\n", - DEFAULT_GEMM_ALPHA); + printf("\t-p, --gemm_scalars=ALPHA_SCALAR_VALUE,BETA_SCALAR_VALUE\n"); + printf("\t\tGEMM alpha and beta values.\n"); + printf("\t\t\tThe value of alpha and beta in floating point. (default: %lf,%lf)\n", + DEFAULT_GEMM_ALPHA, DEFAULT_GEMM_BETA); printf("\t-z, --team_size=SIZE\n"); printf("\t\tKokkos team size.\n"); @@ -250,8 +250,9 @@ int main(int argc, char **argv) { options.blas_args.gemm.gemm_args = DEFAULT_GEMM_ARGS; options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; + options.blas_args.gemm.beta = DEFAULT_GEMM_BETA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:", + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:", long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; @@ -275,14 +276,19 @@ int main(int argc, char **argv) { break; case 'g': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); - if (strlen(optarg) != 3) { + if (strlen(optarg) != 2) { __blas3_perf_test_input_error(argv, ret, optarg); } options.blas_args.gemm.gemm_args = optarg; break; case 'p': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); - options.blas_args.gemm.alpha = (default_scalar)atof(optarg); + double alpha, beta; + if (sscanf(optarg, "%lf,%lf", &alpha, &beta) != 2) + __blas3_perf_test_input_error(argv, ret, optarg); + + options.blas_args.gemm.alpha = static_cast(alpha); + options.blas_args.gemm.beta = static_cast(beta); break; case 'a': // printf("optarg=%s. 
%d\n", optarg, strncasecmp(optarg, "blas", 4)); From a91bd6c9d26f7c8dbddae2b06150f2c3f4bad579 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 16 Feb 2021 10:53:56 -0700 Subject: [PATCH 067/126] perf_test/blas/blas3: Update csv row for --use_auto --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 06d854bc2a..a5dcbbfb0f 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -145,7 +145,10 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, const char *experiment_name = nullptr) { std::string algo_name = test_e_str[options.test]; + std::string ts = std::to_string(gemm_args.bp.team_size); + std::string vlen = std::to_string(gemm_args.bp.vector_len); if (experiment_name) algo_name = std::string(experiment_name); + if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO"; double flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2), gemm_args.B.extent(2)); @@ -154,8 +157,9 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," << options.blas_args.gemm.alpha << "," - << options.blas_args.gemm.beta << "," << gemm_args.bp.team_size - << "," << gemm_args.bp.vector_len << "," + << options.blas_args.gemm.beta << "," + << ts << "," + << vlen << "," << loop_e_str[options.loop] << "," << gemm_args.A.extent(0) << "x" << gemm_args.A.extent(1) << "x" << gemm_args.A.extent(2) << "," << gemm_args.B.extent(0) << "x" << gemm_args.B.extent(1) From 0d4fe93f72950903c138438738c0b1b2789679dd Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 17 Feb 2021 13:25:11 -0700 Subject: [PATCH 068/126] perf_test/blas/blas3: Add -d option for 
view allocation --- perf_test/blas/blas3/KokkosBlas3_common.hpp | 3 +- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 85 ++++++++++++++----- .../blas/blas3/KokkosBlas3_perf_test.cpp | 9 +- 3 files changed, 76 insertions(+), 21 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index 01e368e15c..a2c1e6f6ae 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -62,6 +62,7 @@ #define DEFAULT_TEAM_SIZE 1 #define DEFAULT_VECTOR_LEN 1 #define DEFAULT_USE_AUTO 0 +#define DEFAULT_BATCH_SIZE_LAST_DIM 0 /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { @@ -84,7 +85,7 @@ struct blas_args { // ADD MORE BLAS3 ROUTINES HERE int team_size; int vector_len; - bool use_auto; + bool use_auto, batch_size_last_dim; // ADD MORE COMMON BLAS3 OPTIONS HERE }; typedef struct blas_args blas_args_t; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index a5dcbbfb0f..7e86d04a4f 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -82,6 +82,8 @@ struct TeamVectorTag {}; struct LayoutLeftTag {}; struct LayoutRightTag {}; struct SimdCpuTag {}; +struct LastDimTag {}; +struct FirstDimTag {}; // gemm invoke table void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { @@ -150,11 +152,20 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, if (experiment_name) algo_name = std::string(experiment_name); if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO"; - double flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2), - gemm_args.B.extent(2)); - double gflops = flops / 1e9; + double flops; + double gflops; double average_time = time_in_seconds / options.n; + if (options.blas_args.batch_size_last_dim) { + flops = 
gemm_args.A.extent(2) * __gemm_flop_count(gemm_args.A.extent(0), gemm_args.A.extent(1), + gemm_args.B.extent(1)); + } else { + flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2), + gemm_args.B.extent(2)); + } + + gflops = flops / 1e9; + options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," << options.blas_args.gemm.alpha << "," << options.blas_args.gemm.beta << "," @@ -353,7 +364,7 @@ struct parallel_batched_gemm_range_policy { parallel_batched_gemm_range_policy(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} KOKKOS_INLINE_FUNCTION - void operator()(const int &i) const { + void operator()(const FirstDimTag &, const int &i) const { auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); @@ -361,6 +372,16 @@ struct parallel_batched_gemm_range_policy { KokkosBatched::SerialGemm::invoke( gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); } + + KOKKOS_INLINE_FUNCTION + void operator()(const LastDimTag &, const int &i) const { + auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } }; template void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_args_t gemm_args) { using execution_space = typename device_type::execution_space; - using policy_type = Kokkos::RangePolicy; + using policy_type = Kokkos::RangePolicy; + using policy_type_last_dim = Kokkos::RangePolicy; using functor_type = parallel_batched_gemm_range_policy; @@ -427,19 +449,38 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_ar functor_type 
parallel_batched_gemm_functor(gemm_args); - for (uint32_t i = 0; i < warm_up_n; i++) { - Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(0, options.start.c.k), - parallel_batched_gemm_functor); - Kokkos::fence(); + if (options.blas_args.batch_size_last_dim) { + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type_last_dim(0, options.start.c.k), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + } else { + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); + Kokkos::fence(); + } } - timer.reset(); - for (uint32_t i = 0; i < n; i++) { - Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(0, options.start.c.k), - parallel_batched_gemm_functor); - Kokkos::fence(); + if (options.blas_args.batch_size_last_dim) { + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type_last_dim(0, options.start.c.k), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + } else { + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); + Kokkos::fence(); + } } __gemm_output_csv_row(options, gemm_args, timer.seconds()); @@ -964,9 +1005,15 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; - gemm_args.A = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n); - gemm_args.B = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n); - gemm_args.C = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n); + if (options.blas_args.batch_size_last_dim) { + gemm_args.A = vta("gemm_args.A", dim.a.m, dim.a.n, dim.a.k); + gemm_args.B = vtb("gemm_args.B", dim.b.m, dim.b.n, 
dim.b.k); + gemm_args.C = vtc("gemm_args.C", dim.c.m, dim.c.n, dim.c.k); + } else { + gemm_args.A = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n); + gemm_args.B = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n); + gemm_args.C = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n); + } gemm_args.alpha = options.blas_args.gemm.alpha; gemm_args.beta = options.blas_args.gemm.beta; gemm_args.bp.team_size = options.blas_args.team_size; diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 0ec88f42f7..72a92a32b1 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -67,6 +67,7 @@ static struct option long_options[] = { {"team_size", required_argument, 0, 'z'}, {"vector_len", required_argument, 0, 'n'}, {"batch_size", required_argument, 0, 'k'}, + {"batch_size_last_dim", required_argument, 0, 'd'}, {0, 0, 0, 0}}; static void __print_help_blas3_perf_test() { @@ -129,6 +130,11 @@ static void __print_help_blas3_perf_test() { printf("\t\t\tThe value of LEN as an integer. 
(default: %d)\n", DEFAULT_K); + printf("\t-d, --batch_size_last_dim={0,1}\n"); + printf("\t\tHow to allocate the batch_size in the matrices.\n"); + printf("\t\t\t1 make the batch_size the last dimension, otherwise batch_size is the first dimension (default: %d)\n", + DEFAULT_BATCH_SIZE_LAST_DIM); + printf("\t-l, --loop_type=OPTION\n"); printf("\t\tLoop selection.\n"); printf("\t\t\tValid values for OPTION:\n"); @@ -252,7 +258,7 @@ int main(int argc, char **argv) { options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; options.blas_args.gemm.beta = DEFAULT_GEMM_BETA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:", + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:", long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; @@ -373,6 +379,7 @@ int main(int argc, char **argv) { options.stop.a.k = options.stop.b.k = options.stop.c.k = atoi(optarg); break; + case 'd': options.blas_args.batch_size_last_dim = atoi(optarg); break; case 'z': options.blas_args.team_size = atoi(optarg); break; case 'n': options.blas_args.vector_len = atoi(optarg); break; case 'u': options.blas_args.use_auto = atoi(optarg); break; From 5c729bc9243b903059c2abd1422519079e655d07 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 17 Feb 2021 13:55:15 -0700 Subject: [PATCH 069/126] perf_test/blas/blas3: Update team and team_vector for -d --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 159 ++++++++++++------ 1 file changed, 107 insertions(+), 52 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 7e86d04a4f..3db8f0dc1a 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -77,13 +77,14 @@ void do_gemm_team_vector_batched_blocked_parallel(options_t options); void do_gemm_experiment_parallel(options_t options); struct SerialTag {}; +struct 
SerialBatchDim3Tag {}; struct TeamTag {}; +struct TeamBatchDim3Tag {}; struct TeamVectorTag {}; +struct TeamVectorBatchDim3Tag {}; struct LayoutLeftTag {}; struct LayoutRightTag {}; struct SimdCpuTag {}; -struct LastDimTag {}; -struct FirstDimTag {}; // gemm invoke table void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { @@ -364,7 +365,7 @@ struct parallel_batched_gemm_range_policy { parallel_batched_gemm_range_policy(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} KOKKOS_INLINE_FUNCTION - void operator()(const FirstDimTag &, const int &i) const { + void operator()(const SerialTag &, const int &i) const { auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); @@ -374,7 +375,7 @@ struct parallel_batched_gemm_range_policy { } KOKKOS_INLINE_FUNCTION - void operator()(const LastDimTag &, const int &i) const { + void operator()(const SerialBatchDim3Tag &, const int &i) const { auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i); auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i); auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i); @@ -382,6 +383,15 @@ struct parallel_batched_gemm_range_policy { KokkosBatched::SerialGemm::invoke( gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamTag &, const int &i) const {} + KOKKOS_INLINE_FUNCTION + void operator()(const TeamBatchDim3Tag &, const int &i) const {} + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorTag &, const int &i) const {} + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorBatchDim3Tag &, const int &i) const {} }; template ::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + KOKKOS_INLINE_FUNCTION void operator()(const TeamTag &, const MemberType &member) const { auto i = 
member.league_rank(); @@ -414,6 +435,18 @@ struct parallel_batched_gemm { svB, gemm_args_.beta, svC); } + KOKKOS_INLINE_FUNCTION + void operator()(const TeamBatchDim3Tag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::TeamGemm::invoke(member, gemm_args_.alpha, svA, + svB, gemm_args_.beta, svC); + } + KOKKOS_INLINE_FUNCTION void operator()(const TeamVectorTag &, const MemberType &member) const { auto team_idx = member.league_rank(); @@ -430,14 +463,30 @@ struct parallel_batched_gemm { svB, gemm_args_.beta, svC); } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorBatchDim3Tag &, const MemberType &member) const { + auto team_idx = member.league_rank(); + auto svA = + Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), team_idx); + auto svB = + Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), team_idx); + auto svC = + Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), team_idx); + + KokkosBatched::TeamVectorGemm::invoke(member, + gemm_args_.alpha, svA, + svB, gemm_args_.beta, + svC); + } }; template void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_args_t gemm_args) { using execution_space = typename device_type::execution_space; - using policy_type = Kokkos::RangePolicy; - using policy_type_last_dim = Kokkos::RangePolicy; + using policy_type = Kokkos::RangePolicy; using functor_type = parallel_batched_gemm_range_policy; @@ -449,38 +498,19 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_ar functor_type parallel_batched_gemm_functor(gemm_args); - if (options.blas_args.batch_size_last_dim) { - for (uint32_t i = 0; i < warm_up_n; i++) { - Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - 
policy_type_last_dim(0, options.start.c.k), - parallel_batched_gemm_functor); - Kokkos::fence(); - } - } else { - for (uint32_t i = 0; i < warm_up_n; i++) { - Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(0, options.start.c.k), - parallel_batched_gemm_functor); - Kokkos::fence(); - } + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); + Kokkos::fence(); } - if (options.blas_args.batch_size_last_dim) { - timer.reset(); - for (uint32_t i = 0; i < n; i++) { - Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type_last_dim(0, options.start.c.k), - parallel_batched_gemm_functor); - Kokkos::fence(); - } - } else { - timer.reset(); - for (uint32_t i = 0; i < n; i++) { - Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(0, options.start.c.k), - parallel_batched_gemm_functor); - Kokkos::fence(); - } + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); + Kokkos::fence(); } __gemm_output_csv_row(options, gemm_args, timer.seconds()); @@ -503,8 +533,8 @@ void __do_gemm_parallel_batched_template(options_t options, auto league_size = options.start.c.k; Kokkos::Timer timer; - if (std::is_same::value) { - return __do_gemm_parallel_batched_template_range_policy(options, gemm_args); + if (std::is_same::value || std::is_same::value) { + return __do_gemm_parallel_batched_template_range_policy(options, gemm_args); } STATUS; @@ -1089,41 +1119,66 @@ void do_gemm_serial_batched_blocked(options_t options) { void do_gemm_serial_batched_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } void do_gemm_serial_batched_blocked_parallel(options_t options) { STATUS; - 
__do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } void do_gemm_team_batched_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } void do_gemm_team_batched_blocked_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, - __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); return; } void do_gemm_team_vector_batched_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } From 6da5a7b637552ba325a1d42c0561cf8b294b362a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 18 Feb 2021 13:35:09 -0700 Subject: [PATCH 070/126] perf_test/blas/blas3: Add simd gemm as experiment6. 
--- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 98 ++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 3db8f0dc1a..f24a1091b7 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1023,6 +1023,99 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { return; } +template +class parallel_batched_gemm_experiment6 { + private: + SimdViewType &A, &B, &C; + gemm_args_t gemm_args; + + public: + parallel_batched_gemm_experiment6(SimdViewType &_A, SimdViewType &_B, + SimdViewType &_C, gemm_args_t _gemm_args) + : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + + // Uses two serial for-loops internally + KokkosBatched::TeamVectorGemm::invoke( + member, gemm_args.alpha, svA, svB, gemm_args.beta, svC); + } +}; + +template +void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + // Construct the vector type + using scalar_type = typename view_type_3d::value_type; + constexpr int vl = + KokkosBatched::DefaultVectorLength::value; + constexpr int il = + KokkosBatched::DefaultInternalVectorLength::value; + using vector_type = KokkosBatched::Vector, vl>; + using internal_vector_type = KokkosBatched::Vector, il>; + using view_type = Kokkos::View; + using vector_view_type = Kokkos::View; + using internal_vector_view_type = Kokkos::View; + using 
functor_type = + parallel_batched_gemm_experiment6; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto k = options.start.c.k; + Kokkos::Timer timer; + auto simd_batch_size = k / vl + (k % vl > 0); + STATUS; + + // Construct matrices + vector_view_type A_vector("A_vector", simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1)); + view_type A((scalar_type *)A_vector.data(), simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1)); + internal_vector_view_type A_vector_internal(A_vector.data(), simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1)); + + vector_view_type B_vector("B_vector", simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1)); + view_type B((scalar_type *)B_vector.data(), simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1)); + internal_vector_view_type B_vector_internal(B_vector.data(), simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1)); + + vector_view_type C_vector("C_vector", simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); + view_type C((scalar_type *)C_vector.data(), simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); + internal_vector_view_type C_vector_internal(C_vector.data(), simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); + + uint64_t seed = Kokkos::Impl::clock_tic(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fill_random(B, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fill_random(C, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fence(); + + functor_type experiment6_functor(A_vector_internal, B_vector_internal, C_vector_internal, gemm_args); + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment6Gemm", + policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment6_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; ++i) { + 
Kokkos::parallel_for("parallelBatchedTimedExperiment6Gemm", + policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment6_functor); + Kokkos::fence(); + } + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment6"); + return; +} + /*************************** Internal setup fns **************************/ template gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { @@ -1195,7 +1288,7 @@ void do_gemm_experiment_parallel(options_t options) { using TransBType = Trans::NoTranspose; using BlockingType = Algo::Gemm::Unblocked; - __do_loop_and_invoke( +/* __do_loop_and_invoke( options, __do_gemm_parallel_experiment1); __do_loop_and_invoke( @@ -1209,6 +1302,9 @@ void do_gemm_experiment_parallel(options_t options) { BlockingType, default_device>); __do_loop_and_invoke( options, __do_gemm_parallel_experiment5); */ + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment6); } From 441c4d4a6bfaf16b5e9ae28e8e0795ccea3b1c21 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 23 Feb 2021 11:45:10 -0700 Subject: [PATCH 071/126] perf_test/blas/blas3: Add experiment7 (Simd + TeamGemm) --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 105 +++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index f24a1091b7..86b46e5adb 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1116,6 +1116,106 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { return; } +template +class parallel_batched_gemm_experiment7 { + private: + SimdViewType &A, &B, &C; + gemm_args_t gemm_args; + + public: + parallel_batched_gemm_experiment7(SimdViewType &_A, SimdViewType &_B, + SimdViewType &_C, gemm_args_t _gemm_args) + : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const MemberType &member) 
const { + auto i = member.league_rank(); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, A.extent(0)),[&](const int &vector_lane) { + auto svA = Kokkos::subview(A, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(B, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(C, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::TeamGemm::invoke(member, gemm_args.alpha, svA, svB, gemm_args.beta, svC); + }); + } +}; + +template +void __do_gemm_parallel_experiment7(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + // Construct the vector type + using scalar_type = typename view_type_3d::value_type; + constexpr int vl = + KokkosBatched::DefaultVectorLength::value; + constexpr int il = + KokkosBatched::DefaultInternalVectorLength::value; + using vector_type = KokkosBatched::Vector, vl>; + using internal_vector_type = KokkosBatched::Vector, il>; + using view_type = Kokkos::View; + using vector_view_type = Kokkos::View; + using internal_vector_view_type = Kokkos::View; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto k = options.start.c.k; + Kokkos::Timer timer; + auto simd_batch_size = k / vl + (k % vl > 0); + STATUS; + + // Construct matrices + vector_view_type A_vector("A_vector", gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size); + view_type A((scalar_type *)A_vector.data(), vl, gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size); + internal_vector_view_type A_vector_internal(A_vector.data(), il/vl, gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size); + + vector_view_type B_vector("B_vector", gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size); + view_type B((scalar_type *)B_vector.data(), vl, gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size); + 
internal_vector_view_type B_vector_internal(B_vector.data(), il/vl, gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size); + + vector_view_type C_vector("C_vector", gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size); + view_type C((scalar_type *)C_vector.data(), vl, gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size); + internal_vector_view_type C_vector_internal(C_vector.data(), il/vl, gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size); + + uint64_t seed = Kokkos::Impl::clock_tic(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fill_random(B, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fill_random(C, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fence(); + + using functor_type = + parallel_batched_gemm_experiment7; + functor_type experiment7_functor(A_vector_internal, B_vector_internal, C_vector_internal, gemm_args); + + //using functor_type = + // parallel_batched_gemm_experiment7; + // functor_type experiment7_functor(A, B, C, gemm_args); + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment7Gemm", + policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment7_functor); + //policy_type(simd_batch_size, Kokkos::AUTO, vl), experiment7_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment7Gemm", + policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment7_functor); + //policy_type(simd_batch_size, Kokkos::AUTO, vl), experiment7_functor); + Kokkos::fence(); + } + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment7"); + return; +} + /*************************** Internal setup fns **************************/ template gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { @@ -1302,9 +1402,12 @@ void do_gemm_experiment_parallel(options_t options) 
{ BlockingType, default_device>); __do_loop_and_invoke( options, __do_gemm_parallel_experiment5); */ + BlockingType, default_device>); __do_loop_and_invoke( options, __do_gemm_parallel_experiment6); */ + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment7); } From 3c805868b780334ab037d2ebc47ce711f1246cc5 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 2 Mar 2021 12:51:37 -0700 Subject: [PATCH 072/126] perf_test/blas/blas3: replace experiment7 with batched_team_simd --- perf_test/blas/blas3/KokkosBlas3_common.hpp | 5 +- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 395 +++++++++++------- .../blas/blas3/KokkosBlas3_perf_test.cpp | 19 +- 3 files changed, 261 insertions(+), 158 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index a2c1e6f6ae..b398ed62aa 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -137,6 +137,8 @@ typedef enum TEST { BATCHED_TEAM_BLOCKED, BATCHED_TEAM_VECTOR, BATCHED_TEAM_VECTOR_BLOCKED, + BATCHED_TEAM_SIMD, + BATCHED_TEAM_SIMD_BLOCKED, // ADD MORE TEST TYPES HERE EXPERIMENT, TEST_N @@ -145,7 +147,8 @@ typedef enum TEST { static std::string test_e_str[TEST_N]{ "blas", "batched_serial", "batched_serial_blocked", "batched_team", "batched_team_blocked", "batched_team_vector", - "batched_team_vector_blocked", + "batched_team_vector_blocked", "batched_team_simd", + "batched_team_simd_blocked", // ADD MORE TEST TYPES HERE "experiment"}; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 86b46e5adb..91bf649fed 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -74,6 +74,8 @@ void do_gemm_team_batched_parallel(options_t options); void do_gemm_team_batched_blocked_parallel(options_t options); void do_gemm_team_vector_batched_parallel(options_t options); void 
do_gemm_team_vector_batched_blocked_parallel(options_t options); +void do_gemm_team_simd_batched_parallel(options_t options); +void do_gemm_team_simd_batched_blocked_parallel(options_t options); void do_gemm_experiment_parallel(options_t options); struct SerialTag {}; @@ -82,6 +84,10 @@ struct TeamTag {}; struct TeamBatchDim3Tag {}; struct TeamVectorTag {}; struct TeamVectorBatchDim3Tag {}; +struct TeamSimdTag {}; +struct TeamSimdBatchDim4Tag {}; +// TODO: struct SerialSimdTag {}; +// TODO: struct SerialSimdBatchDim4Tag {}; struct LayoutLeftTag {}; struct LayoutRightTag {}; struct SimdCpuTag {}; @@ -93,6 +99,7 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { do_gemm_serial_batched, do_gemm_serial_batched_blocked, // Serial NULL, NULL, // Team NULL, NULL, // TeamVector + NULL, NULL, // TeamSimd NULL // Serial Experiment }, { @@ -102,6 +109,8 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { do_gemm_team_batched_parallel, do_gemm_team_batched_blocked_parallel, // Team do_gemm_team_vector_batched_parallel, NULL, // TeamVector + do_gemm_team_simd_batched_parallel, + do_gemm_team_simd_batched_blocked_parallel, // TeamSimd do_gemm_experiment_parallel // Parallel Experiment }}; @@ -112,6 +121,18 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { using view_type_3d = Kokkos::View; +using view_type_4d = Kokkos::View; + +// Construct the vector type +using memory_space = typename default_device::execution_space::memory_space; +constexpr int simd_vector_size = + KokkosBatched::DefaultVectorLength::value; +constexpr int simd_internal_vector_size = + KokkosBatched::DefaultInternalVectorLength::value; +using vector_type = KokkosBatched::Vector, simd_vector_size>; +using internal_vector_type = KokkosBatched::Vector, simd_internal_vector_size>; +using vector_view_type_3d = Kokkos::View; +using internal_vector_view_type_4d = Kokkos::View; struct batched_params { int team_size; @@ -119,12 +140,58 @@ struct batched_params { }; typedef struct batched_params 
batched_params_t; +/** + * @brief struct gemm_simd_args encapsulates the data types required + * for allocating and passing a single matrix to the KokkosBatched gemm + * kernels. To invoke gemm on a batch of matrices, three instances of this + * struct are required, one for each matrix, A, B, and C. + * + * @var vec_3d: 3-rank view type used for allocating the underlying data. + * A reference must be kept to this object to ensure the + * data is not free'd by the C++ runtime. + * @var mat_4d: 4-rank view type used for populating the simd view with + random values. + * @var ivec_4d: 4-rank view type used for passing to math kernels. This + * view type is used for leveraging simd instructions on + * both the host and device. + */ +struct gemm_simd_args { + vector_view_type_3d vec_3d; + view_type_4d mat_4d; + internal_vector_view_type_4d ivec_4d; +}; +typedef struct gemm_simd_args gemm_simd_args_t; + +/** + * @brief struct gemm_args are common arguments passed to + * both gemm implementations in the KokkosBlas and KokkosBatched + * namespaces throughout these performance tests. + * + * @var transA: transpose type for A matrix. + * supported types: 'n' - no transpose, 't' - transpose. + * unsupported types: 'c' - conjugate transpose. + * @var transB: transpose type for B matrix. + * supported types: 'n' - no transpose, 't' - transpose. + * unsupported types: 'c' - conjugate transpose. + * @var alpha: scalar applied to A matrix. + * @var beta: scalar applied to B matrix. + * @var A: 3-rank view type used in all non-simd tests. + * @var B: 3-rank view type used in all non-simd tests. + * @var C: 3-rank view type used in all non-simd tests. + * @var bp: team_size and vector_length for tests that use Kokkos::TeamPolicy. + * @var Av: 3-rank and 4-rank vector view types for simd tests. + * @var Bv: 3-rank and 4-rank vector view types for simd tests. + * @var Cv: 3-rank and 4-rank vector view types for simd tests. 
+ */ struct gemm_args { char transA, transB; default_scalar alpha; default_scalar beta; view_type_3d A, B, C; batched_params_t bp; + // Below are matrices for simd tests + gemm_simd_args_t Av, Bv, Cv; + matrix_dims_t dims; }; typedef struct gemm_args gemm_args_t; @@ -135,15 +202,26 @@ static std::string gemm_csv_header_str = /*************************** Internal helper fns **************************/ // Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int __gemm_flop_count(int a_m, int a_n, int b_k) { +static inline int __gemm_flop_count(int a_m, int a_n, int b_n) { if (std::is_same::value || std::is_same::value || std::is_same::value) - return 2 * a_m * b_k * a_n; + return 2 * a_m * b_n * a_n; else // For complex, we need to count 2 flops for each add and 6 flops for each multiply. - return (2 + 6) * a_m * b_k * a_n; + return (2 + 6) * a_m * b_n * a_n; } + +static inline std::string __gemm_output_dim_string(options_t options, matrix_dim_t dim) { + std::string x = "x"; + std::string ret = std::to_string(dim.m) + x + std::to_string(dim.n); + + if (options.blas_args.batch_size_last_dim) + return ret + x + std::to_string(dim.k); + else + return std::to_string(dim.k) + x + ret; +} + static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, const char *experiment_name = nullptr) { @@ -157,13 +235,8 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double gflops; double average_time = time_in_seconds / options.n; - if (options.blas_args.batch_size_last_dim) { - flops = gemm_args.A.extent(2) * __gemm_flop_count(gemm_args.A.extent(0), gemm_args.A.extent(1), - gemm_args.B.extent(1)); - } else { - flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2), - gemm_args.B.extent(2)); - } + flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, gemm_args.dims.a.n, + gemm_args.dims.b.n); gflops = 
flops / 1e9; @@ -172,12 +245,11 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, << options.blas_args.gemm.beta << "," << ts << "," << vlen << "," - << loop_e_str[options.loop] << "," << gemm_args.A.extent(0) - << "x" << gemm_args.A.extent(1) << "x" << gemm_args.A.extent(2) - << "," << gemm_args.B.extent(0) << "x" << gemm_args.B.extent(1) - << "x" << gemm_args.B.extent(2) << "," << gemm_args.C.extent(0) - << "x" << gemm_args.C.extent(1) << "x" << gemm_args.C.extent(2) - << "," << options.warm_up_n << "," << options.n << "," + << loop_e_str[options.loop] << "," + << __gemm_output_dim_string(options, gemm_args.dims.a) << "," + << __gemm_output_dim_string(options, gemm_args.dims.b) << "," + << __gemm_output_dim_string(options, gemm_args.dims.c) << "," + << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," << time_in_seconds / options.n << "," << flops << "," @@ -385,13 +457,34 @@ struct parallel_batched_gemm_range_policy { } KOKKOS_INLINE_FUNCTION - void operator()(const TeamTag &, const int &i) const {} + void operator()(const TeamTag &, const int &i) const { + Kokkos::abort("TeamTag not supported using RangePolicy."); + } + KOKKOS_INLINE_FUNCTION - void operator()(const TeamBatchDim3Tag &, const int &i) const {} + void operator()(const TeamBatchDim3Tag &, const int &i) const { + Kokkos::abort("TeamBatchDim3Tag not supported using RangePolicy."); + } + KOKKOS_INLINE_FUNCTION - void operator()(const TeamVectorTag &, const int &i) const {} + void operator()(const TeamVectorTag &, const int &i) const { + Kokkos::abort("TeamVectorTag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorBatchDim3Tag &, const int &i) const { + Kokkos::abort("TeamVectorBatchDim3Tag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamSimdTag &, const int &i) const { + Kokkos::abort("TeamSimdTag not supported using RangePolicy."); + } + 
KOKKOS_INLINE_FUNCTION - void operator()(const TeamVectorBatchDim3Tag &, const int &i) const {} + void operator()(const TeamSimdBatchDim4Tag &, const int &i) const { + Kokkos::abort("TeamSimdBatchDim4Tag not supported using RangePolicy."); + } }; template ::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamSimdBatchDim4Tag &, const MemberType &member) const { + auto i = member.league_rank(); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, simd_vector_size),[&](const int &vector_lane) { + auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::TeamGemm::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + }); + } }; template ::value || std::is_same::value) { return __do_gemm_parallel_batched_template_range_policy(options, gemm_args); } + if (std::is_same::value || std::is_same::value) { + league_size = options.blas_args.batch_size_last_dim ? gemm_args.Cv.ivec_4d.extent(3) : gemm_args.Cv.ivec_4d.extent(0); + vector_len = simd_vector_size/simd_internal_vector_size; // TODO: use bp.vector_len? 
+ } + STATUS; functor_type parallel_batched_gemm_functor(gemm_args); - auto team_size = gemm_args.bp.team_size; - auto vector_len = gemm_args.bp.vector_len; if (options.blas_args.use_auto) { for (uint32_t i = 0; i < warm_up_n; i++) { @@ -965,7 +1087,7 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { using scalar_type = typename view_type_3d::value_type; constexpr int vl = KokkosBatched::DefaultVectorLength::value; - using simd_type = KokkosBatched::Vector, vl>; + using simd_type = KokkosBatched::Vector, simd_vector_size>; using simd_view_type = Kokkos::View; using functor_type = @@ -1051,6 +1173,7 @@ class parallel_batched_gemm_experiment6 { template void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { +#if 0 using execution_space = typename device_type::execution_space; using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; @@ -1061,8 +1184,6 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { KokkosBatched::DefaultVectorLength::value; constexpr int il = KokkosBatched::DefaultInternalVectorLength::value; - using vector_type = KokkosBatched::Vector, vl>; - using internal_vector_type = KokkosBatched::Vector, il>; using view_type = Kokkos::View; using vector_view_type = Kokkos::View; using internal_vector_view_type = Kokkos::View; @@ -1113,112 +1234,13 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { } __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment6"); - return; -} - -template -class parallel_batched_gemm_experiment7 { - private: - SimdViewType &A, &B, &C; - gemm_args_t gemm_args; - - public: - parallel_batched_gemm_experiment7(SimdViewType &_A, SimdViewType &_B, - SimdViewType &_C, gemm_args_t _gemm_args) - : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const MemberType &member) const { - auto i = member.league_rank(); - 
Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, A.extent(0)),[&](const int &vector_lane) { - auto svA = Kokkos::subview(A, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - auto svB = Kokkos::subview(B, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - auto svC = Kokkos::subview(C, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - - KokkosBatched::TeamGemm::invoke(member, gemm_args.alpha, svA, svB, gemm_args.beta, svC); - }); - } -}; - -template -void __do_gemm_parallel_experiment7(options_t options, gemm_args_t gemm_args) { - using execution_space = typename device_type::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - - // Construct the vector type - using scalar_type = typename view_type_3d::value_type; - constexpr int vl = - KokkosBatched::DefaultVectorLength::value; - constexpr int il = - KokkosBatched::DefaultInternalVectorLength::value; - using vector_type = KokkosBatched::Vector, vl>; - using internal_vector_type = KokkosBatched::Vector, il>; - using view_type = Kokkos::View; - using vector_view_type = Kokkos::View; - using internal_vector_view_type = Kokkos::View; - - uint32_t warm_up_n = options.warm_up_n; - uint32_t n = options.n; - auto k = options.start.c.k; - Kokkos::Timer timer; - auto simd_batch_size = k / vl + (k % vl > 0); - STATUS; - - // Construct matrices - vector_view_type A_vector("A_vector", gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size); - view_type A((scalar_type *)A_vector.data(), vl, gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size); - internal_vector_view_type A_vector_internal(A_vector.data(), il/vl, gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size); - - vector_view_type B_vector("B_vector", gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size); - view_type B((scalar_type *)B_vector.data(), vl, gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size); - internal_vector_view_type 
B_vector_internal(B_vector.data(), il/vl, gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size); - - vector_view_type C_vector("C_vector", gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size); - view_type C((scalar_type *)C_vector.data(), vl, gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size); - internal_vector_view_type C_vector_internal(C_vector.data(), il/vl, gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size); - - uint64_t seed = Kokkos::Impl::clock_tic(); - Kokkos::Random_XorShift64_Pool rand_pool(seed); - Kokkos::fill_random(A, rand_pool, Kokkos::rand, scalar_type>::max()); - Kokkos::fill_random(B, rand_pool, Kokkos::rand, scalar_type>::max()); - Kokkos::fill_random(C, rand_pool, Kokkos::rand, scalar_type>::max()); - Kokkos::fence(); - - using functor_type = - parallel_batched_gemm_experiment7; - functor_type experiment7_functor(A_vector_internal, B_vector_internal, C_vector_internal, gemm_args); - - //using functor_type = - // parallel_batched_gemm_experiment7; - // functor_type experiment7_functor(A, B, C, gemm_args); - - for (uint32_t i = 0; i < warm_up_n; ++i) { - Kokkos::parallel_for("parallelBatchedUntimedExperiment7Gemm", - policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment7_functor); - //policy_type(simd_batch_size, Kokkos::AUTO, vl), experiment7_functor); - Kokkos::fence(); - } - - timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - Kokkos::parallel_for("parallelBatchedTimedExperiment7Gemm", - policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment7_functor); - //policy_type(simd_batch_size, Kokkos::AUTO, vl), experiment7_functor); - Kokkos::fence(); - } - - __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment7"); +#endif return; } /*************************** Internal setup fns **************************/ template -gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { +gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { using execution_space = 
typename device_type::execution_space; gemm_args_t gemm_args; @@ -1226,32 +1248,83 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { Kokkos::Random_XorShift64_Pool rand_pool(seed); STATUS; + gemm_args.dims = dims; gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; - if (options.blas_args.batch_size_last_dim) { - gemm_args.A = vta("gemm_args.A", dim.a.m, dim.a.n, dim.a.k); - gemm_args.B = vtb("gemm_args.B", dim.b.m, dim.b.n, dim.b.k); - gemm_args.C = vtc("gemm_args.C", dim.c.m, dim.c.n, dim.c.k); + if (options.test == BATCHED_TEAM_SIMD || options.test == BATCHED_TEAM_SIMD_BLOCKED) { + // Calculate the batch size for simd views + auto a_simd_batch_size = dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0); + auto b_simd_batch_size = dims.b.k / simd_vector_size + (dims.b.k % simd_vector_size > 0); + auto c_simd_batch_size = dims.c.k / simd_vector_size + (dims.c.k % simd_vector_size > 0); + + // Reference gemm simd arguments for allocating A, B, and C matrices + gemm_simd_args_t &A = gemm_args.Av, &B = gemm_args.Bv, &C = gemm_args.Cv; + + if (options.blas_args.batch_size_last_dim) { + // Construct simd matrices with batch_size in the last dimension (better for LayoutLeft views) + A.vec_3d = vector_view_type_3d ("A_vector", dims.a.m, dims.a.n, a_simd_batch_size); + A.mat_4d = view_type_4d ((scalar_type *)A.vec_3d.data(), simd_vector_size, dims.a.m, dims.a.n, a_simd_batch_size); + A.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)A.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.a.m, dims.a.n, a_simd_batch_size); + + B.vec_3d = vector_view_type_3d ("B_vector", dims.b.m, dims.b.n, b_simd_batch_size); + B.mat_4d = view_type_4d ((scalar_type *)B.vec_3d.data(), simd_vector_size, dims.b.m, dims.b.n, b_simd_batch_size); + B.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)B.mat_4d.data(), 
simd_vector_size/simd_internal_vector_size, dims.b.m, dims.b.n, b_simd_batch_size); + + C.vec_3d = vector_view_type_3d ("C_vector", dims.c.m, dims.c.n, c_simd_batch_size); + C.mat_4d = view_type_4d ((scalar_type *)C.vec_3d.data(), simd_vector_size, dims.c.m, dims.c.n, c_simd_batch_size); + C.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)C.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.c.m, dims.c.n, c_simd_batch_size); + + } else { + // Construct simd matrices with batch_size in the first dimension (better for LayoutRight views) + A.vec_3d = vector_view_type_3d ("A_vector", a_simd_batch_size, dims.a.m, dims.a.n); + A.mat_4d = view_type_4d ((scalar_type *)A.vec_3d.data(), a_simd_batch_size, dims.a.m, dims.a.n, simd_vector_size); + A.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)A.mat_4d.data(), a_simd_batch_size, dims.a.m, dims.a.n, simd_vector_size/simd_internal_vector_size); + + B.vec_3d = vector_view_type_3d ("B_vector", b_simd_batch_size, dims.b.m, dims.b.n); + B.mat_4d = view_type_4d ((scalar_type *)B.vec_3d.data(), b_simd_batch_size, dims.b.m, dims.b.n, simd_vector_size); + B.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)B.mat_4d.data(), b_simd_batch_size, dims.b.m, dims.b.n, simd_vector_size/simd_internal_vector_size); + + C.vec_3d = vector_view_type_3d ("C_vector", c_simd_batch_size, dims.c.m, dims.c.n); + C.mat_4d = view_type_4d ((scalar_type *)C.vec_3d.data(), c_simd_batch_size, dims.c.m, dims.c.n, simd_vector_size); + C.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)C.mat_4d.data(), c_simd_batch_size, dims.c.m, dims.c.n, simd_vector_size/simd_internal_vector_size); + } + + // Use the non-simd 4-rank view type to randomly populate the gemm simd arguments + Kokkos::fill_random(gemm_args.Av.mat_4d, rand_pool, + Kokkos::rand, + scalar_type>::max()); + Kokkos::fill_random(gemm_args.Bv.mat_4d, rand_pool, + Kokkos::rand, + scalar_type>::max()); + 
Kokkos::fill_random(gemm_args.Cv.mat_4d, rand_pool, + Kokkos::rand, + scalar_type>::max()); } else { - gemm_args.A = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n); - gemm_args.B = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n); - gemm_args.C = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n); + if (options.blas_args.batch_size_last_dim) { + gemm_args.A = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k); + gemm_args.B = vtb("gemm_args.B", dims.b.m, dims.b.n, dims.b.k); + gemm_args.C = vtc("gemm_args.C", dims.c.m, dims.c.n, dims.c.k); + } else { + gemm_args.A = vta("gemm_args.A", dims.a.k, dims.a.m, dims.a.n); + gemm_args.B = vtb("gemm_args.B", dims.b.k, dims.b.m, dims.b.n); + gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); + } + + Kokkos::fill_random(gemm_args.A, rand_pool, + Kokkos::rand, + scalar_type>::max()); + Kokkos::fill_random(gemm_args.B, rand_pool, + Kokkos::rand, + scalar_type>::max()); + Kokkos::fill_random(gemm_args.C, rand_pool, + Kokkos::rand, + scalar_type>::max()); } gemm_args.alpha = options.blas_args.gemm.alpha; gemm_args.beta = options.blas_args.gemm.beta; gemm_args.bp.team_size = options.blas_args.team_size; gemm_args.bp.vector_len = options.blas_args.vector_len; - Kokkos::fill_random(gemm_args.A, rand_pool, - Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.B, rand_pool, - Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.C, rand_pool, - Kokkos::rand, - scalar_type>::max()); - return gemm_args; } @@ -1265,7 +1338,8 @@ void __do_loop_and_invoke(options_t options, __print_gemm_perf_test_options(options); std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:" << typeid(default_device).name() << std::endl; + << ", DEVICE:" << typeid(default_device).name() + << ", SPACE:" << typeid(memory_space).name() << std::endl; options.out[0] << gemm_csv_header_str << std::endl; @@ -1375,6 +1449,34 @@ void 
do_gemm_team_vector_batched_parallel(options_t options) { return; } +void do_gemm_team_simd_batched_parallel(options_t options) { + STATUS; + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_team_simd_batched_blocked_parallel(options_t options) { + STATUS; + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + + +// Blocked algo not yet implemented for TeamVectorGemm. /* void do_gemm_team_vector_batched_blocked_parallel(options_t options) { STATUS; __do_loop_and_invoke( @@ -1388,7 +1490,7 @@ void do_gemm_experiment_parallel(options_t options) { using TransBType = Trans::NoTranspose; using BlockingType = Algo::Gemm::Unblocked; -/* __do_loop_and_invoke( + __do_loop_and_invoke( options, __do_gemm_parallel_experiment1); __do_loop_and_invoke( @@ -1405,9 +1507,6 @@ void do_gemm_experiment_parallel(options_t options) { BlockingType, default_device>); __do_loop_and_invoke( options, __do_gemm_parallel_experiment6); */ - __do_loop_and_invoke( - options, __do_gemm_parallel_experiment7); } diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 72a92a32b1..17aac3d526 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -74,7 +74,7 @@ static void __print_help_blas3_perf_test() { printf("Options:\n"); printf("\t-h, --help\n"); - printf("\t\tPrint this help menu.\n\n"); + printf("\t\tPrint this help menu.\n"); printf("\t-t, --test=OPTION\n"); printf("\t\tAlgorithm selection.\n"); @@ -145,7 +145,7 @@ static void __print_help_blas3_perf_test() { printf("%c[1m", 27); printf("\t\t\t\tparallel:"); printf("%c[0m", 27); - printf(" invoke blas routine in a 
Kokkos::parallel_for-loop.\n\n"); + printf(" invoke blas routine in a Kokkos::parallel_for-loop.\n"); printf("\t-b, --matrix_size_start=MxN,IxJ,PxQ\n"); printf( @@ -153,7 +153,7 @@ static void __print_help_blas3_perf_test() { "(start)\n"); printf( "\t\t\tValid values for M and N are any non-negative 32-bit integers. " - "(default: %dx%d,%dx%d,%dx%d)\n\n", + "(default: %dx%d,%dx%d,%dx%d)\n", DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START); @@ -163,7 +163,7 @@ static void __print_help_blas3_perf_test() { "(stop)\n"); printf( "\t\t\tValid dimension values are any non-negative 32-bit integers. " - "(default: %dx%d,%dx%d,%dx%d)\n\n", + "(default: %dx%d,%dx%d,%dx%d)\n", DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP); @@ -171,34 +171,34 @@ static void __print_help_blas3_perf_test() { printf("\t\tMatrix step selection.\n"); printf( "\t\t\tValid value for K is any non-negative 32-bit integer. (default: " - "%d)\n\n", + "%d)\n", DEFAULT_STEP); printf("\t-w, --warm_up_loop=LOOP\n"); printf("\t\tWarm up loop selection. (untimed)\n"); printf( "\t\t\tValid value for LOOP is any non-negative 32-bit integer that's <= " - "ITER. (default: %d)\n\n", + "ITER. (default: %d)\n", DEFAULT_WARM_UP_N); printf("\t-i, --iter=ITER\n"); printf("\t\tIteration selection. (timed)\n"); printf( "\t\t\tValid value for ITER is any non-negative 32-bit integer. " - "(default: %d)\n\n", + "(default: %d)\n", DEFAULT_N); printf("\t-c, --csv=/path/to/file.csv\n"); printf("\t\tCsv output file selection.\n"); printf( "\t\t\tValid value for /path/to/file.csv is any valid file name. " - "(default: stdout)\n\n"); + "(default: stdout)\n"); printf("\t-r, --routines=ROUTINES\n"); printf("\t\tRoutine selection.\n"); printf( "\t\t\tValid value for ROUTINES is one of more valid blas3 routines " - "delimited by a comma. 
(default: %s)\n\n", + "delimited by a comma. (default: %s)\n", DEFAULT_BLAS_ROUTINES); } @@ -250,6 +250,7 @@ int main(int argc, char **argv) { options.blas_args.team_size = DEFAULT_TEAM_SIZE; options.blas_args.vector_len = DEFAULT_VECTOR_LEN; options.blas_args.use_auto = DEFAULT_USE_AUTO; + options.blas_args.batch_size_last_dim = DEFAULT_BATCH_SIZE_LAST_DIM; options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS; options.blas_args.trmm.alpha = DEFAULT_TRMM_ALPHA; From b5c7b88b1682e9eeb19d8358582f5b5df042a340 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 2 Mar 2021 12:52:01 -0700 Subject: [PATCH 073/126] perf_test/batched: Add README.md --- perf_test/batched/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 perf_test/batched/README.md diff --git a/perf_test/batched/README.md b/perf_test/batched/README.md new file mode 100644 index 0000000000..ca5920ae39 --- /dev/null +++ b/perf_test/batched/README.md @@ -0,0 +1 @@ +Batched BLAS performance tests reside in `perf_test/blas/{blas,blas3}`. From d9e9d04d2005334ed2638b37d149cc67fa43eee7 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 2 Mar 2021 16:14:04 -0700 Subject: [PATCH 074/126] perf_test/blas/blas3: Add last gemm test types - Added serial simd test types. - Added serial compact mkl test type. --- perf_test/blas/blas3/KokkosBlas3_common.hpp | 29 ++++-- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 93 +++++++++++++++---- 2 files changed, 97 insertions(+), 25 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index b398ed62aa..d37f11eea9 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -119,20 +119,28 @@ static std::string loop_e_str[LOOP_N] = {"serial", "parallel"}; /** * @var BLAS: Run the blas routine through the - * KokkosBlas namespace. + * KokkosBlas namespace. * @var BATCHED_SERIAL{_BLOCKED}: Run the serial blas routine through the * KokkosBatched namespace. 
+ * @var BATCHED_SERIAL_SIMD{_BLOCKED}: Run the serial blas routine through the + * KokkosBatched namespace using SIMD views. + * @var BATCHED_SERIAL_COMPACT_MKL: Run the serial blas mkl routine through + * the KokkosBatched namespace. * @var BATCHED_TEAM{_BLOCKED}: Run the team blas routine through the - * KokkosBatched namespace. + * KokkosBatched namespace. * @var BATCHED_TEAM_VECTOR{_BLOCKED}: Run the team vector blas routine through - * the KokkosBatched namespace. - * @var EXPERIMENT: Run the blas routine as a custom - * experiment. + * the KokkosBatched namespace. + * @var BATCHED_TEAM_SIMD{_BLOCKED}: Run the team vector blas routine through the + * KokkosBatched namespace using SIMD views. + * @var EXPERIMENT: Run the blas routine as a custom experiment. */ typedef enum TEST { BLAS, BATCHED_SERIAL, BATCHED_SERIAL_BLOCKED, + BATCHED_SERIAL_SIMD, + BATCHED_SERIAL_SIMD_BLOCKED, + BATCHED_SERIAL_COMPACT_MKL, BATCHED_TEAM, BATCHED_TEAM_BLOCKED, BATCHED_TEAM_VECTOR, @@ -145,10 +153,13 @@ typedef enum TEST { } test_e; static std::string test_e_str[TEST_N]{ - "blas", "batched_serial", "batched_serial_blocked", "batched_team", - "batched_team_blocked", "batched_team_vector", - "batched_team_vector_blocked", "batched_team_simd", - "batched_team_simd_blocked", + "blas", + "batched_serial", "batched_serial_blocked", + "batched_serial_simd", "batched_serial_simd_blocked", + "batched_serial_compact_mkl", + "batched_team", "batched_team_blocked", + "batched_team_vector", "batched_team_vector_blocked", + "batched_team_simd", "batched_team_simd_blocked", // ADD MORE TEST TYPES HERE "experiment"}; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 91bf649fed..5fffd02dc8 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -70,6 +70,9 @@ void do_gemm_serial_batched_blocked(options_t options); // invocation! 
void do_gemm_serial_batched_parallel(options_t options); void do_gemm_serial_batched_blocked_parallel(options_t options); +void do_gemm_serial_simd_batched_parallel(options_t options); +void do_gemm_serial_simd_batched_blocked_parallel(options_t options); +void do_gemm_serial_batched_compact_mkl_parallel(options_t options); void do_gemm_team_batched_parallel(options_t options); void do_gemm_team_batched_blocked_parallel(options_t options); void do_gemm_team_vector_batched_parallel(options_t options); @@ -104,8 +107,11 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { }, { NULL, // BLAS - do_gemm_serial_batched_parallel, - do_gemm_serial_batched_blocked_parallel, // Serial + do_gemm_serial_batched_parallel, // Serial + do_gemm_serial_batched_blocked_parallel, + do_gemm_serial_simd_batched_parallel, + do_gemm_serial_simd_batched_blocked_parallel, + do_gemm_serial_batched_compact_mkl_parallel, do_gemm_team_batched_parallel, do_gemm_team_batched_blocked_parallel, // Team do_gemm_team_vector_batched_parallel, NULL, // TeamVector @@ -488,7 +494,7 @@ struct parallel_batched_gemm_range_policy { }; template + class BlockingType, class AlgoMode = void> struct parallel_batched_gemm { gemm_args_t gemm_args_; @@ -582,7 +588,7 @@ struct parallel_batched_gemm { auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, i, Kokkos::ALL(), Kokkos::ALL(), vector_lane); auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, i, Kokkos::ALL(), Kokkos::ALL(), vector_lane); - KokkosBatched::TeamGemm::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); }); } @@ -594,7 +600,7 @@ struct parallel_batched_gemm { auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - KokkosBatched::TeamGemm::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + 
KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); }); } }; @@ -636,14 +642,14 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_ar } template + class device_type, class algo_mode = void> void __do_gemm_parallel_batched_template(options_t options, gemm_args_t gemm_args) { using execution_space = typename device_type::execution_space; using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; using functor_type = - parallel_batched_gemm; + parallel_batched_gemm; uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; @@ -702,7 +708,7 @@ void __do_gemm_parallel_batched_template(options_t options, return; } -template +template void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { char a = gemm_args.transA; char b = gemm_args.transB; @@ -714,19 +720,19 @@ void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { if (a == 'N' && b == 'N') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, gemm_args); } else if (a == 'N' && b == 'T') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, gemm_args); //} else if (a == 'N' && b == 'C') { // __do_gemm_parallel_batched_template(options, gemm_args); } else if (a == 'T' && b == 'N') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, gemm_args); } else if (a == 'T' && b == 'T') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, gemm_args); //} else if (a == 'T' && b == 'C') { // __do_gemm_parallel_batched_template(options, gemm_args); @@ -1410,6 +1416,61 @@ void do_gemm_serial_batched_blocked_parallel(options_t options) { return; } +void do_gemm_serial_simd_batched_parallel(options_t options) { + STATUS; + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + 
else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_serial_simd_batched_blocked_parallel(options_t options) { + STATUS; + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_serial_batched_compact_mkl_parallel(options_t options) { + STATUS; +#if \ + defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ + defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ + defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); +#else + #if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) + std::cerr << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is undefined." << std::endl; + #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) + std::cerr << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is undefined." << std::endl; + #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) + std::cerr << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ is undefined." 
<< std::endl; + #endif +#endif + return; +} + void do_gemm_team_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) @@ -1454,11 +1515,11 @@ void do_gemm_team_simd_batched_parallel(options_t options) { if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, __do_gemm_parallel_batched); + default_device, Mode::Team>); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + default_device, Mode::Team>); return; } @@ -1467,11 +1528,11 @@ void do_gemm_team_simd_batched_blocked_parallel(options_t options) { if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, __do_gemm_parallel_batched); + default_device, Mode::Team>); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + default_device, Mode::Team>); return; } From fa23cf75b5b4da16a468dfa9640b8bc84b5d5614 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 2 Mar 2021 16:19:04 -0700 Subject: [PATCH 075/126] perf_test/blas/blas3: Apply clang-format --- perf_test/blas/blas3/KokkosBlas3_common.hpp | 19 +- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 480 +++++++++++------- .../blas/blas3/KokkosBlas3_perf_test.cpp | 83 +-- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 116 +++-- 4 files changed, 405 insertions(+), 293 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index d37f11eea9..a991efe61e 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -130,9 +130,10 @@ static std::string loop_e_str[LOOP_N] = {"serial", "parallel"}; * KokkosBatched namespace. * @var BATCHED_TEAM_VECTOR{_BLOCKED}: Run the team vector blas routine through * the KokkosBatched namespace. - * @var BATCHED_TEAM_SIMD{_BLOCKED}: Run the team vector blas routine through the - * KokkosBatched namespace using SIMD views. - * @var EXPERIMENT: Run the blas routine as a custom experiment. 
+ * @var BATCHED_TEAM_SIMD{_BLOCKED}: Run the team vector blas routine through + * the KokkosBatched namespace using SIMD views. + * @var EXPERIMENT: Run the blas routine as a custom + * experiment. */ typedef enum TEST { BLAS, @@ -153,13 +154,11 @@ typedef enum TEST { } test_e; static std::string test_e_str[TEST_N]{ - "blas", - "batched_serial", "batched_serial_blocked", - "batched_serial_simd", "batched_serial_simd_blocked", - "batched_serial_compact_mkl", - "batched_team", "batched_team_blocked", - "batched_team_vector", "batched_team_vector_blocked", - "batched_team_simd", "batched_team_simd_blocked", + "blas", "batched_serial", "batched_serial_blocked", "batched_serial_simd", + "batched_serial_simd_blocked", "batched_serial_compact_mkl", "batched_team", + "batched_team_blocked", "batched_team_vector", + "batched_team_vector_blocked", "batched_team_simd", + "batched_team_simd_blocked", // ADD MORE TEST TYPES HERE "experiment"}; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 5fffd02dc8..3e55a85799 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -106,16 +106,16 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { NULL // Serial Experiment }, { - NULL, // BLAS - do_gemm_serial_batched_parallel, // Serial + NULL, // BLAS + do_gemm_serial_batched_parallel, // Serial do_gemm_serial_batched_blocked_parallel, do_gemm_serial_simd_batched_parallel, - do_gemm_serial_simd_batched_blocked_parallel, - do_gemm_serial_batched_compact_mkl_parallel, + do_gemm_serial_simd_batched_blocked_parallel, + do_gemm_serial_batched_compact_mkl_parallel, do_gemm_team_batched_parallel, do_gemm_team_batched_blocked_parallel, // Team do_gemm_team_vector_batched_parallel, NULL, // TeamVector - do_gemm_team_simd_batched_parallel, + do_gemm_team_simd_batched_parallel, do_gemm_team_simd_batched_blocked_parallel, // TeamSimd 
do_gemm_experiment_parallel // Parallel Experiment }}; @@ -123,22 +123,29 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { /*************************** Test types and defaults **************************/ #define DEFAULT_GEMM_ARGS "NN" #define DEFAULT_GEMM_ALPHA 1.0 -#define DEFAULT_GEMM_BETA 1.0 +#define DEFAULT_GEMM_BETA 1.0 using view_type_3d = Kokkos::View; -using view_type_4d = Kokkos::View; +using view_type_4d = + Kokkos::View; // Construct the vector type using memory_space = typename default_device::execution_space::memory_space; constexpr int simd_vector_size = KokkosBatched::DefaultVectorLength::value; -constexpr int simd_internal_vector_size = - KokkosBatched::DefaultInternalVectorLength::value; -using vector_type = KokkosBatched::Vector, simd_vector_size>; -using internal_vector_type = KokkosBatched::Vector, simd_internal_vector_size>; -using vector_view_type_3d = Kokkos::View; -using internal_vector_view_type_4d = Kokkos::View; +constexpr int simd_internal_vector_size = + KokkosBatched::DefaultInternalVectorLength::value; +using vector_type = KokkosBatched::Vector, + simd_vector_size>; +using internal_vector_type = + KokkosBatched::Vector, + simd_internal_vector_size>; +using vector_view_type_3d = + Kokkos::View; +using internal_vector_view_type_4d = + Kokkos::View; struct batched_params { int team_size; @@ -151,14 +158,14 @@ typedef struct batched_params batched_params_t; * for allocating and passing a single matrix to the KokkosBatched gemm * kernels. To invoke gemm on a batch of matrices, three instances of this * struct are required, one for each matrix, A, B, and C. - * + * * @var vec_3d: 3-rank view type used for allocating the underlying data. * A reference must be kept to this object to ensure the * data is not free'd by the C++ runtime. * @var mat_4d: 4-rank view type used for populating the simd view with random values. * @var ivec_4d: 4-rank view type used for passing to math kernels. 
This - * view type is used for leveraging simd instructions on + * view type is used for leveraging simd instructions on * both the host and device. */ struct gemm_simd_args { @@ -184,11 +191,12 @@ typedef struct gemm_simd_args gemm_simd_args_t; * @var A: 3-rank view type used in all non-simd tests. * @var B: 3-rank view type used in all non-simd tests. * @var C: 3-rank view type used in all non-simd tests. - * @var bp: team_size and vector_length for tests that use Kokkos::TeamPolicy. + * @var bp: team_size and vector_length for tests that use + * Kokkos::TeamPolicy. * @var Av: 3-rank and 4-rank vector view types for simd tests. * @var Bv: 3-rank and 4-rank vector view types for simd tests. * @var Cv: 3-rank and 4-rank vector view types for simd tests. - */ + */ struct gemm_args { char transA, transB; default_scalar alpha; @@ -207,19 +215,22 @@ static std::string gemm_csv_header_str = "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ -// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +// Flop count formula from lapack working note 41: +// http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf static inline int __gemm_flop_count(int a_m, int a_n, int b_n) { - if (std::is_same::value || - std::is_same::value || - std::is_same::value) - return 2 * a_m * b_n * a_n; - else - // For complex, we need to count 2 flops for each add and 6 flops for each multiply. - return (2 + 6) * a_m * b_n * a_n; + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return 2 * a_m * b_n * a_n; + else + // For complex, we need to count 2 flops for each add and 6 flops for each + // multiply. 
+ return (2 + 6) * a_m * b_n * a_n; } -static inline std::string __gemm_output_dim_string(options_t options, matrix_dim_t dim) { - std::string x = "x"; +static inline std::string __gemm_output_dim_string(options_t options, + matrix_dim_t dim) { + std::string x = "x"; std::string ret = std::to_string(dim.m) + x + std::to_string(dim.n); if (options.blas_args.batch_size_last_dim) @@ -232,8 +243,8 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, const char *experiment_name = nullptr) { std::string algo_name = test_e_str[options.test]; - std::string ts = std::to_string(gemm_args.bp.team_size); - std::string vlen = std::to_string(gemm_args.bp.vector_len); + std::string ts = std::to_string(gemm_args.bp.team_size); + std::string vlen = std::to_string(gemm_args.bp.vector_len); if (experiment_name) algo_name = std::string(experiment_name); if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO"; @@ -241,26 +252,22 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double gflops; double average_time = time_in_seconds / options.n; - flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, gemm_args.dims.a.n, - gemm_args.dims.b.n); + flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, + gemm_args.dims.a.n, + gemm_args.dims.b.n); gflops = flops / 1e9; options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," << options.blas_args.gemm.alpha << "," - << options.blas_args.gemm.beta << "," - << ts << "," - << vlen << "," - << loop_e_str[options.loop] << "," - << __gemm_output_dim_string(options, gemm_args.dims.a) << "," - << __gemm_output_dim_string(options, gemm_args.dims.b) << "," - << __gemm_output_dim_string(options, gemm_args.dims.c) << "," - << options.warm_up_n << "," << options.n << "," - << time_in_seconds << "," - << time_in_seconds / options.n << "," - << flops << "," - << gflops / average_time - << std::endl; + << options.blas_args.gemm.beta 
<< "," << ts << "," << vlen + << "," << loop_e_str[options.loop] << "," + << __gemm_output_dim_string(options, gemm_args.dims.a) << "," + << __gemm_output_dim_string(options, gemm_args.dims.b) << "," + << __gemm_output_dim_string(options, gemm_args.dims.c) << "," + << options.warm_up_n << "," << options.n << "," + << time_in_seconds << "," << time_in_seconds / options.n << "," + << flops << "," << gflops / average_time << std::endl; } static void __print_gemm_perf_test_options(options_t options) { @@ -435,12 +442,12 @@ void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) { return; } -template +template struct parallel_batched_gemm_range_policy { gemm_args_t gemm_args_; - parallel_batched_gemm_range_policy(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} + parallel_batched_gemm_range_policy(gemm_args_t gemm_args) + : gemm_args_(gemm_args) {} KOKKOS_INLINE_FUNCTION void operator()(const SerialTag &, const int &i) const { @@ -470,27 +477,27 @@ struct parallel_batched_gemm_range_policy { KOKKOS_INLINE_FUNCTION void operator()(const TeamBatchDim3Tag &, const int &i) const { Kokkos::abort("TeamBatchDim3Tag not supported using RangePolicy."); - } + } KOKKOS_INLINE_FUNCTION void operator()(const TeamVectorTag &, const int &i) const { Kokkos::abort("TeamVectorTag not supported using RangePolicy."); - } + } KOKKOS_INLINE_FUNCTION void operator()(const TeamVectorBatchDim3Tag &, const int &i) const { Kokkos::abort("TeamVectorBatchDim3Tag not supported using RangePolicy."); - } + } KOKKOS_INLINE_FUNCTION void operator()(const TeamSimdTag &, const int &i) const { Kokkos::abort("TeamSimdTag not supported using RangePolicy."); - } + } KOKKOS_INLINE_FUNCTION void operator()(const TeamSimdBatchDim4Tag &, const int &i) const { Kokkos::abort("TeamSimdBatchDim4Tag not supported using RangePolicy."); - } + } }; template ::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); - }); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, 
gemm_args_.Cv.ivec_4d.extent(3)), + [&](const int &vector_lane) { + auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, i, Kokkos::ALL(), + Kokkos::ALL(), vector_lane); + auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, i, Kokkos::ALL(), + Kokkos::ALL(), vector_lane); + auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, i, Kokkos::ALL(), + Kokkos::ALL(), vector_lane); + + KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, + svA, svB, gemm_args_.beta, + svC); + }); } KOKKOS_INLINE_FUNCTION - void operator()(const TeamSimdBatchDim4Tag &, const MemberType &member) const { + void operator()(const TeamSimdBatchDim4Tag &, + const MemberType &member) const { auto i = member.league_rank(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, simd_vector_size),[&](const int &vector_lane) { - auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - - KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); - }); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, simd_vector_size), + [&](const int &vector_lane) { + auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane, + Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane, + Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane, + Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, + svA, svB, gemm_args_.beta, + svC); + }); } }; template -void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_args_t gemm_args) { +void __do_gemm_parallel_batched_template_range_policy(options_t options, + gemm_args_t gemm_args) { using execution_space = typename device_type::execution_space; using 
policy_type = Kokkos::RangePolicy; using functor_type = @@ -623,16 +649,16 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_ar for (uint32_t i = 0; i < warm_up_n; i++) { Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(0, options.start.c.k), - parallel_batched_gemm_functor); + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); Kokkos::fence(); } timer.reset(); for (uint32_t i = 0; i < n; i++) { Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(0, options.start.c.k), - parallel_batched_gemm_functor); + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); Kokkos::fence(); } @@ -649,22 +675,30 @@ void __do_gemm_parallel_batched_template(options_t options, using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; using functor_type = - parallel_batched_gemm; + parallel_batched_gemm; uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; auto league_size = options.start.c.k; - auto team_size = gemm_args.bp.team_size; - auto vector_len = gemm_args.bp.vector_len; + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; Kokkos::Timer timer; - if (std::is_same::value || std::is_same::value) { - return __do_gemm_parallel_batched_template_range_policy(options, gemm_args); + if (std::is_same::value || + std::is_same::value) { + return __do_gemm_parallel_batched_template_range_policy< + TransAType, TransBType, BlockingType, AlgoTag, device_type>(options, + gemm_args); } - if (std::is_same::value || std::is_same::value) { - league_size = options.blas_args.batch_size_last_dim ? gemm_args.Cv.ivec_4d.extent(3) : gemm_args.Cv.ivec_4d.extent(0); - vector_len = simd_vector_size/simd_internal_vector_size; // TODO: use bp.vector_len? + if (std::is_same::value || + std::is_same::value) { + league_size = options.blas_args.batch_size_last_dim + ? 
gemm_args.Cv.ivec_4d.extent(3) + : gemm_args.Cv.ivec_4d.extent(0); + vector_len = simd_vector_size / + simd_internal_vector_size; // TODO: use bp.vector_len? } STATUS; @@ -674,31 +708,31 @@ void __do_gemm_parallel_batched_template(options_t options, if (options.blas_args.use_auto) { for (uint32_t i = 0; i < warm_up_n; i++) { Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), - parallel_batched_gemm_functor); + policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), + parallel_batched_gemm_functor); Kokkos::fence(); } timer.reset(); for (uint32_t i = 0; i < n; i++) { Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), - parallel_batched_gemm_functor); + policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), + parallel_batched_gemm_functor); Kokkos::fence(); } } else { for (uint32_t i = 0; i < warm_up_n; i++) { Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(league_size, team_size, vector_len), - parallel_batched_gemm_functor); + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); Kokkos::fence(); } timer.reset(); for (uint32_t i = 0; i < n; i++) { Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(league_size, team_size, vector_len), - parallel_batched_gemm_functor); + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); Kokkos::fence(); } } @@ -708,7 +742,8 @@ void __do_gemm_parallel_batched_template(options_t options, return; } -template +template void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { char a = gemm_args.transA; char b = gemm_args.transB; @@ -720,19 +755,23 @@ void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { if (a == 'N' && b == 'N') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); } else if (a == 'N' && b == 'T') { 
__do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); //} else if (a == 'N' && b == 'C') { // __do_gemm_parallel_batched_template(options, gemm_args); } else if (a == 'T' && b == 'N') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); } else if (a == 'T' && b == 'T') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); //} else if (a == 'T' && b == 'C') { // __do_gemm_parallel_batched_template(options, gemm_args); @@ -1093,7 +1132,8 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { using scalar_type = typename view_type_3d::value_type; constexpr int vl = KokkosBatched::DefaultVectorLength::value; - using simd_type = KokkosBatched::Vector, simd_vector_size>; + using simd_type = + KokkosBatched::Vector, simd_vector_size>; using simd_view_type = Kokkos::View; using functor_type = @@ -1118,12 +1158,12 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { // uint64_t seed = Kokkos::Impl::clock_tic(); // Kokkos::Random_XorShift64_Pool rand_pool(seed); // Kokkos::fill_random(A, rand_pool, - // Kokkos::rand, simd_type>::max()); - // Kokkos::fill_random(B, rand_pool, - // Kokkos::rand, simd_type>::max()); - // Kokkos::fill_random(C, rand_pool, - // Kokkos::rand, simd_type>::max()); - // execution_space::fence(); + // Kokkos::rand, + // simd_type>::max()); Kokkos::fill_random(B, rand_pool, + // Kokkos::rand, + // simd_type>::max()); Kokkos::fill_random(C, rand_pool, + // Kokkos::rand, + // simd_type>::max()); execution_space::fence(); functor_type experiment5_functor(A, B, C, gemm_args); @@ -1151,8 +1191,8 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { return; } -template +template class parallel_batched_gemm_experiment6 { private: SimdViewType &A, &B, &C; @@ -1165,14 +1205,16 @@ class 
parallel_batched_gemm_experiment6 { KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { - auto i = member.league_rank(); + auto i = member.league_rank(); auto svA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); auto svB = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); auto svC = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); // Uses two serial for-loops internally - KokkosBatched::TeamVectorGemm::invoke( - member, gemm_args.alpha, svA, svB, gemm_args.beta, svC); + KokkosBatched::TeamVectorGemm::invoke(member, gemm_args.alpha, + svA, svB, + gemm_args.beta, svC); } }; @@ -1254,77 +1296,111 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { Kokkos::Random_XorShift64_Pool rand_pool(seed); STATUS; - gemm_args.dims = dims; - gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; - gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; - if (options.test == BATCHED_TEAM_SIMD || options.test == BATCHED_TEAM_SIMD_BLOCKED) { + gemm_args.dims = dims; + gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; + gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; + if (options.test == BATCHED_TEAM_SIMD || + options.test == BATCHED_TEAM_SIMD_BLOCKED) { // Calculate the batch size for simd views - auto a_simd_batch_size = dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0); - auto b_simd_batch_size = dims.b.k / simd_vector_size + (dims.b.k % simd_vector_size > 0); - auto c_simd_batch_size = dims.c.k / simd_vector_size + (dims.c.k % simd_vector_size > 0); + auto a_simd_batch_size = + dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0); + auto b_simd_batch_size = + dims.b.k / simd_vector_size + (dims.b.k % simd_vector_size > 0); + auto c_simd_batch_size = + dims.c.k / simd_vector_size + (dims.c.k % simd_vector_size > 0); // Reference gemm simd arguments for allocating A, B, and C matrices gemm_simd_args_t &A = gemm_args.Av, &B = gemm_args.Bv, &C = gemm_args.Cv; if 
(options.blas_args.batch_size_last_dim) { - // Construct simd matrices with batch_size in the last dimension (better for LayoutLeft views) - A.vec_3d = vector_view_type_3d ("A_vector", dims.a.m, dims.a.n, a_simd_batch_size); - A.mat_4d = view_type_4d ((scalar_type *)A.vec_3d.data(), simd_vector_size, dims.a.m, dims.a.n, a_simd_batch_size); - A.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)A.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.a.m, dims.a.n, a_simd_batch_size); - - B.vec_3d = vector_view_type_3d ("B_vector", dims.b.m, dims.b.n, b_simd_batch_size); - B.mat_4d = view_type_4d ((scalar_type *)B.vec_3d.data(), simd_vector_size, dims.b.m, dims.b.n, b_simd_batch_size); - B.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)B.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.b.m, dims.b.n, b_simd_batch_size); - - C.vec_3d = vector_view_type_3d ("C_vector", dims.c.m, dims.c.n, c_simd_batch_size); - C.mat_4d = view_type_4d ((scalar_type *)C.vec_3d.data(), simd_vector_size, dims.c.m, dims.c.n, c_simd_batch_size); - C.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)C.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.c.m, dims.c.n, c_simd_batch_size); + // Construct simd matrices with batch_size in the last dimension (better + // for LayoutLeft views) + A.vec_3d = vector_view_type_3d("A_vector", dims.a.m, dims.a.n, + a_simd_batch_size); + A.mat_4d = view_type_4d((scalar_type *)A.vec_3d.data(), simd_vector_size, + dims.a.m, dims.a.n, a_simd_batch_size); + A.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)A.mat_4d.data(), + simd_vector_size / simd_internal_vector_size, dims.a.m, dims.a.n, + a_simd_batch_size); + + B.vec_3d = vector_view_type_3d("B_vector", dims.b.m, dims.b.n, + b_simd_batch_size); + B.mat_4d = view_type_4d((scalar_type *)B.vec_3d.data(), simd_vector_size, + dims.b.m, dims.b.n, b_simd_batch_size); + B.ivec_4d = internal_vector_view_type_4d( + 
(internal_vector_type *)B.mat_4d.data(), + simd_vector_size / simd_internal_vector_size, dims.b.m, dims.b.n, + b_simd_batch_size); + + C.vec_3d = vector_view_type_3d("C_vector", dims.c.m, dims.c.n, + c_simd_batch_size); + C.mat_4d = view_type_4d((scalar_type *)C.vec_3d.data(), simd_vector_size, + dims.c.m, dims.c.n, c_simd_batch_size); + C.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)C.mat_4d.data(), + simd_vector_size / simd_internal_vector_size, dims.c.m, dims.c.n, + c_simd_batch_size); } else { - // Construct simd matrices with batch_size in the first dimension (better for LayoutRight views) - A.vec_3d = vector_view_type_3d ("A_vector", a_simd_batch_size, dims.a.m, dims.a.n); - A.mat_4d = view_type_4d ((scalar_type *)A.vec_3d.data(), a_simd_batch_size, dims.a.m, dims.a.n, simd_vector_size); - A.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)A.mat_4d.data(), a_simd_batch_size, dims.a.m, dims.a.n, simd_vector_size/simd_internal_vector_size); - - B.vec_3d = vector_view_type_3d ("B_vector", b_simd_batch_size, dims.b.m, dims.b.n); - B.mat_4d = view_type_4d ((scalar_type *)B.vec_3d.data(), b_simd_batch_size, dims.b.m, dims.b.n, simd_vector_size); - B.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)B.mat_4d.data(), b_simd_batch_size, dims.b.m, dims.b.n, simd_vector_size/simd_internal_vector_size); - - C.vec_3d = vector_view_type_3d ("C_vector", c_simd_batch_size, dims.c.m, dims.c.n); - C.mat_4d = view_type_4d ((scalar_type *)C.vec_3d.data(), c_simd_batch_size, dims.c.m, dims.c.n, simd_vector_size); - C.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)C.mat_4d.data(), c_simd_batch_size, dims.c.m, dims.c.n, simd_vector_size/simd_internal_vector_size); + // Construct simd matrices with batch_size in the first dimension (better + // for LayoutRight views) + A.vec_3d = vector_view_type_3d("A_vector", a_simd_batch_size, dims.a.m, + dims.a.n); + A.mat_4d = view_type_4d((scalar_type *)A.vec_3d.data(), 
a_simd_batch_size, + dims.a.m, dims.a.n, simd_vector_size); + A.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)A.mat_4d.data(), a_simd_batch_size, dims.a.m, + dims.a.n, simd_vector_size / simd_internal_vector_size); + + B.vec_3d = vector_view_type_3d("B_vector", b_simd_batch_size, dims.b.m, + dims.b.n); + B.mat_4d = view_type_4d((scalar_type *)B.vec_3d.data(), b_simd_batch_size, + dims.b.m, dims.b.n, simd_vector_size); + B.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)B.mat_4d.data(), b_simd_batch_size, dims.b.m, + dims.b.n, simd_vector_size / simd_internal_vector_size); + + C.vec_3d = vector_view_type_3d("C_vector", c_simd_batch_size, dims.c.m, + dims.c.n); + C.mat_4d = view_type_4d((scalar_type *)C.vec_3d.data(), c_simd_batch_size, + dims.c.m, dims.c.n, simd_vector_size); + C.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)C.mat_4d.data(), c_simd_batch_size, dims.c.m, + dims.c.n, simd_vector_size / simd_internal_vector_size); } - // Use the non-simd 4-rank view type to randomly populate the gemm simd arguments + // Use the non-simd 4-rank view type to randomly populate the gemm simd + // arguments Kokkos::fill_random(gemm_args.Av.mat_4d, rand_pool, Kokkos::rand, - scalar_type>::max()); + scalar_type>::max()); Kokkos::fill_random(gemm_args.Bv.mat_4d, rand_pool, Kokkos::rand, - scalar_type>::max()); + scalar_type>::max()); Kokkos::fill_random(gemm_args.Cv.mat_4d, rand_pool, Kokkos::rand, - scalar_type>::max()); + scalar_type>::max()); } else { if (options.blas_args.batch_size_last_dim) { - gemm_args.A = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k); - gemm_args.B = vtb("gemm_args.B", dims.b.m, dims.b.n, dims.b.k); - gemm_args.C = vtc("gemm_args.C", dims.c.m, dims.c.n, dims.c.k); + gemm_args.A = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k); + gemm_args.B = vtb("gemm_args.B", dims.b.m, dims.b.n, dims.b.k); + gemm_args.C = vtc("gemm_args.C", dims.c.m, dims.c.n, dims.c.k); } else { - gemm_args.A = 
vta("gemm_args.A", dims.a.k, dims.a.m, dims.a.n); - gemm_args.B = vtb("gemm_args.B", dims.b.k, dims.b.m, dims.b.n); - gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); + gemm_args.A = vta("gemm_args.A", dims.a.k, dims.a.m, dims.a.n); + gemm_args.B = vtb("gemm_args.B", dims.b.k, dims.b.m, dims.b.n); + gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); } Kokkos::fill_random(gemm_args.A, rand_pool, Kokkos::rand, - scalar_type>::max()); + scalar_type>::max()); Kokkos::fill_random(gemm_args.B, rand_pool, Kokkos::rand, - scalar_type>::max()); + scalar_type>::max()); Kokkos::fill_random(gemm_args.C, rand_pool, Kokkos::rand, - scalar_type>::max()); + scalar_type>::max()); } gemm_args.alpha = options.blas_args.gemm.alpha; gemm_args.beta = options.blas_args.gemm.beta; @@ -1344,7 +1420,7 @@ void __do_loop_and_invoke(options_t options, __print_gemm_perf_test_options(options); std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:" << typeid(default_device).name() + << ", DEVICE:" << typeid(default_device).name() << ", SPACE:" << typeid(memory_space).name() << std::endl; options.out[0] << gemm_csv_header_str << std::endl; @@ -1394,8 +1470,9 @@ void do_gemm_serial_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); 
#else - #if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) - std::cerr << std::string(__func__) - << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is undefined." << std::endl; - #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) - std::cerr << std::string(__func__) - << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is undefined." << std::endl; - #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) - std::cerr << std::string(__func__) - << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ is undefined." << std::endl; - #endif +#if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) + std::cerr + << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is undefined." + << std::endl; +#elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) + std::cerr << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is " + "undefined." + << std::endl; +#elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) + std::cerr + << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ " + "is undefined." 
+ << std::endl; +#endif #endif return; } @@ -1475,8 +1562,9 @@ void do_gemm_team_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, - __do_gemm_parallel_batched); + options, __do_gemm_parallel_batched); return; } void do_gemm_team_vector_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); return; } void do_gemm_team_simd_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched(alpha); - options.blas_args.gemm.beta = static_cast(beta); + options.blas_args.gemm.beta = static_cast(beta); break; case 'a': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index a313eabbaf..f84479d26e 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -78,34 +78,36 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { * assumes KokkosBatched::SerialTrmm is being used. 
Since the dot products * do a multiply and add we can calculate the flops for any element in the last * column of the LHS to be 2*columns_LHS, any element in the last-1 column of - * the LHS to be 2*(columns_LHS-1), and so on. We do this for every row of the LHS - * giving us this flop count: - * flops = columns_LHS * (columns_LHS + 1) - * flops = (flops / 2) * 2 - * flops = flops * rows_LHS + * the LHS to be 2*(columns_LHS-1), and so on. We do this for every row of the + * LHS giving us this flop count: flops = columns_LHS * (columns_LHS + 1) flops + * = (flops / 2) * 2 flops = flops * rows_LHS */ -static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { +static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, + int a_n) { int flops; if (side == 'L' || side == 'l') { - flops = (b_m * (b_m + 1)) * b_n; + flops = (b_m * (b_m + 1)) * b_n; } else { - flops = (b_n * (b_n + 1)) * b_m; + flops = (b_n * (b_n + 1)) * b_m; } if (std::is_same::value || - std::is_same::value || - std::is_same::value) - return flops; + std::is_same::value || + std::is_same::value) + return flops; // Account for 6 additional flops when complex numbers are used. // Above we have counted 1 flop for each add and 1 flop for each multiply. - // For complex, we need to count 2 flops for each add and 6 flops for each multiply. + // For complex, we need to count 2 flops for each add and 6 flops for each + // multiply. 
return flops * 4; } -// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { +// Flop count formula from lapack working note 41: +// http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, + int a_n) { int flops; if (side == 'L' || side == 'l') { @@ -115,13 +117,14 @@ static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) } if (std::is_same::value || - std::is_same::value || - std::is_same::value) - return flops; + std::is_same::value || + std::is_same::value) + return flops; // Account for 6 additional flops when complex numbers are used. // Above we have counted 1 flop for each add and 1 flop for each multiply. - // For complex, we need to count 2 flops for each add and 6 flops for each multiply. + // For complex, we need to count 2 flops for each add and 6 flops for each + // multiply. 
return flops * 4; } @@ -136,17 +139,21 @@ typedef struct trmm_args trmm_args_t; static std::string trmm_csv_header_str = "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n," - "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s),min_achieved_bandwidth(GB/s),max_achieved_bandwidth(GB/s)"; + "iter,total_time(s),average_time(s),FLOPS,GFLOP/" + "average_time(s),min_achieved_bandwidth(GB/s),max_achieved_bandwidth(GB/s)"; /*************************** Internal helper fns **************************/ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, double time_in_seconds) { - double flops = trmm_args.A.extent(0) * trmm_flop_count(trmm_args.side, - trmm_args.B.extent(1), trmm_args.B.extent(2), - trmm_args.A.extent(1), trmm_args.A.extent(2)); - double gflops = flops / 1e9; - double average_time = time_in_seconds / options.n; - double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * sizeof(default_scalar)) / 1e9; + double flops = trmm_args.A.extent(0) * + trmm_flop_count(trmm_args.side, trmm_args.B.extent(1), + trmm_args.B.extent(2), trmm_args.A.extent(1), + trmm_args.A.extent(2)); + double gflops = flops / 1e9; + double average_time = time_in_seconds / options.n; + double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) * + trmm_args.B.extent(2) * sizeof(default_scalar)) / + 1e9; double min_memory_transactions, max_memory_transactions; // Assuming infinite cache size @@ -155,26 +162,29 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, min_memory_transactions = 3; // Assuming no register or real caching - // We have to go out to memory for every element we read from A and B as well as - // every element we write to B. - // We use the trmm flops from lapack note 41 and multiple by 3/2 to account for the - // write to B since this flop count is for one multiply and one add. 
+ // We have to go out to memory for every element we read from A and B as well + // as every element we write to B. We use the trmm flops from lapack note 41 + // and multiple by 3/2 to account for the write to B since this flop count is + // for one multiply and one add. if (trmm_args.side == 'l' || trmm_args.side == 'L') - max_memory_transactions = trmm_args.B.extent(1) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * (3./2.); + max_memory_transactions = trmm_args.B.extent(1) * trmm_args.B.extent(1) * + trmm_args.B.extent(2) * (3. / 2.); else - max_memory_transactions = trmm_args.B.extent(2) * trmm_args.B.extent(2) * trmm_args.B.extent(1) * (3./2.); + max_memory_transactions = trmm_args.B.extent(2) * trmm_args.B.extent(2) * + trmm_args.B.extent(1) * (3. / 2.); options.out[0] << test_e_str[options.test] << "," << options.blas_args.trmm.trmm_args << "," << options.blas_args.trmm.alpha << "," - << loop_e_str[options.loop] << "," << trmm_args.A.extent(0) << "x" << trmm_args.A.extent(1) - << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1) + << loop_e_str[options.loop] << "," << trmm_args.A.extent(0) + << "x" << trmm_args.A.extent(1) << "x" << trmm_args.A.extent(2) + << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1) << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," - << average_time << "," - << flops << "," - << gflops / average_time << "," - << (gbytes_in_matrix * min_memory_transactions) / average_time << "," + << average_time << "," << flops << "," << gflops / average_time + << "," + << (gbytes_in_matrix * min_memory_transactions) / average_time + << "," << (gbytes_in_matrix * max_memory_transactions) / average_time << std::endl; } @@ -218,7 +228,7 @@ void __do_trmm_serial_blas(options_t options, trmm_args_t trmm_args) { auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, 
&trmm_args.trans, - &trmm_args.diag, trmm_args.alpha, A, B); + &trmm_args.diag, trmm_args.alpha, A, B); } // Fence after submitting each batch operation Kokkos::fence(); @@ -231,7 +241,7 @@ void __do_trmm_serial_blas(options_t options, trmm_args_t trmm_args) { auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, - &trmm_args.diag, trmm_args.alpha, A, B); + &trmm_args.diag, trmm_args.alpha, A, B); } // Fence after submitting each batch operation Kokkos::fence(); @@ -412,18 +422,20 @@ void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { STATUS; for (uint32_t j = 0; j < warm_up_n; ++j) { - Kokkos::parallel_for("parallelBlasWarmUpLoopTrmm", - Kokkos::RangePolicy(0, options.start.a.k), - parallel_blas_trmm_functor); + Kokkos::parallel_for( + "parallelBlasWarmUpLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trmm_functor); // Fence after each batch operation Kokkos::fence(); } timer.reset(); for (uint32_t j = 0; j < n; ++j) { - Kokkos::parallel_for("parallelBlasTimedLoopTrmm", - Kokkos::RangePolicy(0, options.start.a.k), - parallel_blas_trmm_functor); + Kokkos::parallel_for( + "parallelBlasTimedLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trmm_functor); // Fence after each batch operation Kokkos::fence(); } @@ -470,18 +482,20 @@ void __do_trmm_parallel_batched_template(options_t options, STATUS; for (uint32_t j = 0; j < warm_up_n; ++j) { - Kokkos::parallel_for("parallelBatchedWarmUpLoopTrmm", - Kokkos::RangePolicy(0, options.start.a.k), - parallel_batched_trmm_functor); + Kokkos::parallel_for( + "parallelBatchedWarmUpLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trmm_functor); // Fence after each batch operation Kokkos::fence(); } timer.reset(); for (uint32_t j = 0; j < n; ++j) { - Kokkos::parallel_for("parallelBatchedTimedLoopTrmm", - Kokkos::RangePolicy(0, options.start.a.k), - 
parallel_batched_trmm_functor); + Kokkos::parallel_for( + "parallelBatchedTimedLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trmm_functor); // Fence after each batch operation Kokkos::fence(); } From e5fb960c340f628242d0266f8dd9f03608d715c2 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 2 Mar 2021 16:20:25 -0700 Subject: [PATCH 076/126] perf_test/blas/blas3: Allocate simd views - Allocate simd views for serial simd tests. --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 3e55a85799..74f0771062 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1300,7 +1300,9 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; if (options.test == BATCHED_TEAM_SIMD || - options.test == BATCHED_TEAM_SIMD_BLOCKED) { + options.test == BATCHED_TEAM_SIMD_BLOCKED || + options.test == BATCHED_SERIAL_SIMD || + options.test == BATCHED_SERIAL_SIMD_BLOCKED) { // Calculate the batch size for simd views auto a_simd_batch_size = dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0); From 30d54723e3d926ed3f65a3db521fec1929df27c9 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Mar 2021 08:27:47 -0700 Subject: [PATCH 077/126] perf_test/blas/blas3: Update compact mkl functors --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 41 ++++++++++++++++--- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 74f0771062..d646653697 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ 
b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -83,14 +83,14 @@ void do_gemm_experiment_parallel(options_t options); struct SerialTag {}; struct SerialBatchDim3Tag {}; +struct SerialSimdTag {}; +struct SerialSimdBatchDim3Tag {}; struct TeamTag {}; struct TeamBatchDim3Tag {}; struct TeamVectorTag {}; struct TeamVectorBatchDim3Tag {}; struct TeamSimdTag {}; struct TeamSimdBatchDim4Tag {}; -// TODO: struct SerialSimdTag {}; -// TODO: struct SerialSimdBatchDim4Tag {}; struct LayoutLeftTag {}; struct LayoutRightTag {}; struct SimdCpuTag {}; @@ -469,6 +469,32 @@ struct parallel_batched_gemm_range_policy { gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); } + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdTag &, const int &i) const { + auto svA = Kokkos::subview(gemm_args_.Av.vec_3d, i, Kokkos::ALL(), + Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.Bv.vec_3d, i, Kokkos::ALL(), + Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.Cv.vec_3d, i, Kokkos::ALL(), + Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdBatchDim3Tag &, const int &i) const { + auto svA = Kokkos::subview(gemm_args_.Av.vec_3d, + Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.Bv.vec_3d, + Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.Cv.vec_3d, + Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + KOKKOS_INLINE_FUNCTION void operator()(const TeamTag &, const int &i) const { Kokkos::abort("TeamTag not supported using RangePolicy."); @@ -686,7 +712,9 @@ void __do_gemm_parallel_batched_template(options_t options, Kokkos::Timer timer; if (std::is_same::value || - std::is_same::value) { + std::is_same::value || + std::is_same::value || + std::is_same::value) { return __do_gemm_parallel_batched_template_range_policy< 
TransAType, TransBType, BlockingType, AlgoTag, device_type>(options, gemm_args); @@ -1302,7 +1330,8 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { if (options.test == BATCHED_TEAM_SIMD || options.test == BATCHED_TEAM_SIMD_BLOCKED || options.test == BATCHED_SERIAL_SIMD || - options.test == BATCHED_SERIAL_SIMD_BLOCKED) { + options.test == BATCHED_SERIAL_SIMD_BLOCKED || + options.test == BATCHED_SERIAL_COMPACT_MKL) { // Calculate the batch size for simd views auto a_simd_batch_size = dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0); @@ -1532,11 +1561,11 @@ void do_gemm_serial_batched_compact_mkl_parallel(options_t options) { if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, - __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); #else #if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) From 2401e9dbde85183f7655cc955e4375f2f48d34f3 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Mar 2021 09:01:55 -0700 Subject: [PATCH 078/126] perf_test/blas/blas3: Added operators for SerialSimd --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index d646653697..c2f3f58ced 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -654,6 +654,16 @@ struct parallel_batched_gemm { svC); }); } + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdTag &, const MemberType &member) const { + Kokkos::abort("SerialSimdTag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdBatchDim3Tag &, const MemberType &member) const { + Kokkos::abort("SerialSimdBatchDim3Tag not supported using RangePolicy."); + } }; template Date: Wed, 3 Mar 2021 15:34:21 -0700 Subject: [PATCH 079/126] perf_test/blas/blas3: Fix 
compactMKL batch size --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index c2f3f58ced..b575bc186b 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -677,15 +677,23 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; + auto batch_size = options.start.c.k; Kokkos::Timer timer; STATUS; functor_type parallel_batched_gemm_functor(gemm_args); + if (std::is_same::value || + std::is_same::value) { + batch_size = options.blas_args.batch_size_last_dim + ? gemm_args.Cv.vec_3d.extent(2) + : gemm_args.Cv.vec_3d.extent(0); + } + for (uint32_t i = 0; i < warm_up_n; i++) { Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(0, options.start.c.k), + policy_type(0, batch_size), parallel_batched_gemm_functor); Kokkos::fence(); } @@ -693,7 +701,7 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, timer.reset(); for (uint32_t i = 0; i < n; i++) { Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(0, options.start.c.k), + policy_type(0, batch_size), parallel_batched_gemm_functor); Kokkos::fence(); } From 1eab5b4f04754ddbe18038a13733fec5bbc6176f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Mar 2021 12:56:01 -0700 Subject: [PATCH 080/126] perf_test/blas: Fix internal function names --- perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp | 6 +++--- perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index 32626cfba5..e6abeaefc4 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ 
b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -82,7 +82,7 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = { * The KokkosBatched::SerialTrtri implementation performs trmm and scal on subblocks * of the A matrix. a_m subblocks are selected. */ -static inline int trtri_impl_flop_count(int a_m, int a_n) { +static inline int __trtri_impl_flop_count(int a_m, int a_n) { int flop_count = 0; int flops_per_div, flops_per_mul, flops_per_add; @@ -109,7 +109,7 @@ static inline int trtri_impl_flop_count(int a_m, int a_n) { } // Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int trtri_flop_count(int a_m, int a_n) { +static inline int __trtri_flop_count(int a_m, int a_n) { int flops; int flops_per_mul; int flops_per_add; @@ -151,7 +151,7 @@ static std::string trtri_csv_header_str = /*************************** Internal helper fns **************************/ static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args, double time_in_seconds) { - double flops = trtri_args.A.extent(0) * trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2)); + double flops = trtri_args.A.extent(0) * __trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2)); double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index f84479d26e..bd6392cf06 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -82,7 +82,7 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { * LHS giving us this flop count: flops = columns_LHS * (columns_LHS + 1) flops * = (flops / 2) * 2 flops = flops * rows_LHS */ -static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, +static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { int flops; @@ -106,7 
+106,7 @@ static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, // Flop count formula from lapack working note 41: // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, +static inline int __trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { int flops; @@ -146,7 +146,7 @@ static std::string trmm_csv_header_str = static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, double time_in_seconds) { double flops = trmm_args.A.extent(0) * - trmm_flop_count(trmm_args.side, trmm_args.B.extent(1), + __trmm_flop_count(trmm_args.side, trmm_args.B.extent(1), trmm_args.B.extent(2), trmm_args.A.extent(1), trmm_args.A.extent(2)); double gflops = flops / 1e9; From c7e4f5437c31c7f9c52928adcc8d31260b6418ea Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Mar 2021 15:36:31 -0700 Subject: [PATCH 081/126] perf_test/blas/blas3: Apply clang-format --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 44 ++++++++++--------- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 8 ++-- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index b575bc186b..d7f2143dc6 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -471,12 +471,12 @@ struct parallel_batched_gemm_range_policy { KOKKOS_INLINE_FUNCTION void operator()(const SerialSimdTag &, const int &i) const { - auto svA = Kokkos::subview(gemm_args_.Av.vec_3d, i, Kokkos::ALL(), - Kokkos::ALL()); - auto svB = Kokkos::subview(gemm_args_.Bv.vec_3d, i, Kokkos::ALL(), - Kokkos::ALL()); - auto svC = Kokkos::subview(gemm_args_.Cv.vec_3d, i, Kokkos::ALL(), - Kokkos::ALL()); + auto svA = + Kokkos::subview(gemm_args_.Av.vec_3d, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = + Kokkos::subview(gemm_args_.Bv.vec_3d, i, Kokkos::ALL(), 
Kokkos::ALL()); + auto svC = + Kokkos::subview(gemm_args_.Cv.vec_3d, i, Kokkos::ALL(), Kokkos::ALL()); KokkosBatched::SerialGemm::invoke( gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); @@ -484,12 +484,12 @@ struct parallel_batched_gemm_range_policy { KOKKOS_INLINE_FUNCTION void operator()(const SerialSimdBatchDim3Tag &, const int &i) const { - auto svA = Kokkos::subview(gemm_args_.Av.vec_3d, - Kokkos::ALL(), Kokkos::ALL(), i); - auto svB = Kokkos::subview(gemm_args_.Bv.vec_3d, - Kokkos::ALL(), Kokkos::ALL(), i); - auto svC = Kokkos::subview(gemm_args_.Cv.vec_3d, - Kokkos::ALL(), Kokkos::ALL(), i); + auto svA = + Kokkos::subview(gemm_args_.Av.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = + Kokkos::subview(gemm_args_.Bv.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = + Kokkos::subview(gemm_args_.Cv.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i); KokkosBatched::SerialGemm::invoke( gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); @@ -661,7 +661,8 @@ struct parallel_batched_gemm { } KOKKOS_INLINE_FUNCTION - void operator()(const SerialSimdBatchDim3Tag &, const MemberType &member) const { + void operator()(const SerialSimdBatchDim3Tag &, + const MemberType &member) const { Kokkos::abort("SerialSimdBatchDim3Tag not supported using RangePolicy."); } }; @@ -677,7 +678,7 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; - auto batch_size = options.start.c.k; + auto batch_size = options.start.c.k; Kokkos::Timer timer; STATUS; @@ -687,9 +688,9 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, if (std::is_same::value || std::is_same::value) { batch_size = options.blas_args.batch_size_last_dim - ? gemm_args.Cv.vec_3d.extent(2) - : gemm_args.Cv.vec_3d.extent(0); - } + ? 
gemm_args.Cv.vec_3d.extent(2) + : gemm_args.Cv.vec_3d.extent(0); + } for (uint32_t i = 0; i < warm_up_n; i++) { Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", @@ -1579,12 +1580,13 @@ void do_gemm_serial_batched_compact_mkl_parallel(options_t options) { if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, - __do_gemm_parallel_batched); + __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); #else #if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) std::cerr diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index bd6392cf06..86714b7e30 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -83,7 +83,7 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { * = (flops / 2) * 2 flops = flops * rows_LHS */ static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, - int a_n) { + int a_n) { int flops; if (side == 'L' || side == 'l') { @@ -107,7 +107,7 @@ static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, // Flop count formula from lapack working note 41: // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf static inline int __trmm_flop_count(char side, int b_m, int b_n, int a_m, - int a_n) { + int a_n) { int flops; if (side == 'L' || side == 'l') { @@ -147,8 +147,8 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, double time_in_seconds) { double flops = trmm_args.A.extent(0) * __trmm_flop_count(trmm_args.side, trmm_args.B.extent(1), - trmm_args.B.extent(2), trmm_args.A.extent(1), - trmm_args.A.extent(2)); + trmm_args.B.extent(2), trmm_args.A.extent(1), + trmm_args.A.extent(2)); double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) * From 
147783e45bf8aeab0e8e6e37ee5952538fd9904b Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 4 Mar 2021 10:26:14 -0700 Subject: [PATCH 082/126] perf_test/blas/blas3: Fix -d 1 for team and serial simd --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index d7f2143dc6..b7be38fdb9 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -639,7 +639,7 @@ struct parallel_batched_gemm { const MemberType &member) const { auto i = member.league_rank(); Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, simd_vector_size), + Kokkos::ThreadVectorRange(member, gemm_args_.Cv.ivec_4d.extent(0)), [&](const int &vector_lane) { auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); From e3efd455be26670110cadb517724111dc86c3ba0 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 4 Mar 2021 11:24:44 -0700 Subject: [PATCH 083/126] perf_test/blas/blas3: Update serial simd to use RangePolicy --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index b7be38fdb9..09c3d27465 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1549,11 +1549,11 @@ void do_gemm_serial_simd_batched_parallel(options_t options) { if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, - __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); return; } @@ -1563,11 +1563,11 @@ void do_gemm_serial_simd_batched_blocked_parallel(options_t options) { if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( 
options, - __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); return; } From 0127243a0363dd3bceb4dac90a95054a98656e6f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 4 Mar 2021 11:31:09 -0700 Subject: [PATCH 084/126] perf_test/blas: Update flop counts to use double --- perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp | 14 +++++++------- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index e6abeaefc4..d60f15b92b 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -82,9 +82,9 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = { * The KokkosBatched::SerialTrtri implementation performs trmm and scal on subblocks * of the A matrix. a_m subblocks are selected. 
*/ -static inline int __trtri_impl_flop_count(int a_m, int a_n) { - int flop_count = 0; - int flops_per_div, flops_per_mul, flops_per_add; +static inline double __trtri_impl_flop_count(double a_m, double a_n) { + double flop_count = 0; + double flops_per_div, flops_per_mul, flops_per_add; if (std::is_same::value || std::is_same::value || @@ -109,10 +109,10 @@ static inline int __trtri_impl_flop_count(int a_m, int a_n) { } // Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int __trtri_flop_count(int a_m, int a_n) { - int flops; - int flops_per_mul; - int flops_per_add; +static inline double __trtri_flop_count(double a_m, double a_n) { + double flops; + double flops_per_mul; + double flops_per_add; if (a_m != a_n) { fprintf(stderr, "%s:%d:ERROR: a_m != a_n.\n", __FILE__, __LINE__); diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 09c3d27465..36132db261 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -217,7 +217,7 @@ static std::string gemm_csv_header_str = /*************************** Internal helper fns **************************/ // Flop count formula from lapack working note 41: // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int __gemm_flop_count(int a_m, int a_n, int b_n) { +static inline double __gemm_flop_count(double a_m, double a_n, double b_n) { if (std::is_same::value || std::is_same::value || std::is_same::value) diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 86714b7e30..6d67e96bd1 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -106,9 +106,9 @@ static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, // Flop count formula from lapack working note 
41: // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int __trmm_flop_count(char side, int b_m, int b_n, int a_m, - int a_n) { - int flops; +static inline double __trmm_flop_count(char side, double b_m, double b_n, double a_m, + double a_n) { + double flops; if (side == 'L' || side == 'l') { flops = b_m * b_m * b_n; From 4acdaf51142081f32b3139dcfca6aa24f8bf8ccc Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 5 Mar 2021 11:59:04 -0700 Subject: [PATCH 085/126] perf_test/blas/blas3: Added verify option - Implemented verify checks in gemm. Simd verify is still failing when the batch_size is not divisible by the simd_vector_len. --- perf_test/blas/blas3/CMakeLists.txt | 1 + perf_test/blas/blas3/KokkosBlas3_common.hpp | 4 + .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 166 +++++++++++++++++- .../blas/blas3/KokkosBlas3_perf_test.cpp | 19 +- test_common/KokkosKernels_TestUtils.hpp | 15 ++ 5 files changed, 197 insertions(+), 8 deletions(-) diff --git a/perf_test/blas/blas3/CMakeLists.txt b/perf_test/blas/blas3/CMakeLists.txt index c1e3a117fa..8f83bd6b99 100644 --- a/perf_test/blas/blas3/CMakeLists.txt +++ b/perf_test/blas/blas3/CMakeLists.txt @@ -1,5 +1,6 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/tpls/gtest) KOKKOSKERNELS_ADD_EXECUTABLE( KokkosBlas3_perf_test diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index a991efe61e..2103d0d57e 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -63,6 +63,7 @@ #define DEFAULT_VECTOR_LEN 1 #define DEFAULT_USE_AUTO 0 #define DEFAULT_BATCH_SIZE_LAST_DIM 0 +#define DEFAULT_VERIFY 1 /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { @@ -192,6 +193,8 @@ typedef struct matrix_dims matrix_dims_t; * @var out_file: The 
file to write csv data to. Defaults to stdout. * @var blas_args: Arguments for each supported blas routine. * @var blas_routines: Selects which supported blas routines to test. + * @var verify: Performs verification of the blas routine for each input + * before timing it. */ struct perf_test_options { test_e test; @@ -205,6 +208,7 @@ struct perf_test_options { std::string out_file; blas_args_t blas_args; std::string blas_routines; + bool verify; }; typedef struct perf_test_options options_t; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 36132db261..df08e30aaa 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -56,6 +56,8 @@ //#include "KokkosBatched_Gemm_Team_Impl.hpp" //#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" #include "KokkosBatched_Util.hpp" +#include "gtest/gtest.h" // EXPECT_NEAR +#include "KokkosKernels_TestUtils.hpp" //#define GEMM_PERF_TEST_DEBUG @@ -252,6 +254,9 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double gflops; double average_time = time_in_seconds / options.n; + if (options.verify) + return; + flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, gemm_args.dims.a.n, gemm_args.dims.b.n); @@ -360,8 +365,8 @@ void __do_gemm_serial_batched_template(options_t options, template void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) { - char a = gemm_args.transA; - char b = gemm_args.transB; + char a = toupper(gemm_args.transA); + char b = toupper(gemm_args.transB); using N = Trans::NoTranspose; using T = Trans::Transpose; // using C = Trans::ConjTranspose; @@ -1333,6 +1338,154 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { return; } +/** + * Check difference of scalars expected and actual at indexes i,j,k + * @var expected: The expected result. + * @var actual: The actual result. 
+ * @var epsilon: The tolerance to use when comparing. + * @return true if the comparison fails and false if the comparison succeeds. + */ +static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type_3d actual, int i, int j, int k, double epsilon) { + STATUS; + auto diff = static_cast(Kokkos::Experimental::fabs(expected(i,j,k) - actual(i,j,k))); + + if (diff > epsilon) { + printf("fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", + i,j,k,static_cast(expected(i,j,k)), + i,j,k,static_cast(actual(i,j,k)), + diff, + epsilon); + FATAL_ERROR("Comparison failure!"); + return true; + } + return false; +} + +/** + * Compare all values of expected with all values of actual. + * @var expected: the expected results + * @var actual: the actual results + * @return false if expected matches actual within epsilon, otherwise true. + */ +template +static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) { + double epsilon = Test::epsilon::value; + STATUS; + + for (size_t i = 0; i < expected.extent(0); i++) { + for (size_t j = 0; j < expected.extent(1); j++) { + for (size_t k = 0; k < expected.extent(2); k++) { + if (std::is_same::value) { + return __gemm_print_compare_failure(expected, actual, i, j, k, epsilon); + } + if (std::is_same::value) { + return __gemm_print_compare_failure(expected, actual, k, j, i, epsilon); + } + } + } + } + return false; +} + +/** + * Compare all values of expected with all values of actual. + * @var expected: the expected results + * @var actual: the actual results + * @return false if expected matches actual within epsilon, otherwise true. 
+ */ +template +static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t actual) { + std::cout << actual.mat_4d.extent(0) << "x" << actual.mat_4d.extent(1) << "x" << actual.mat_4d.extent(2) << "x" << actual.mat_4d.extent(3) << std::endl; + decltype(expected) actual_data(actual.mat_4d.data(), expected.extent(0), expected.extent(1), expected.extent(2)); + STATUS; + return __gemm_do_compare(expected, actual_data); +} + +template +static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, void (*fn)(options_t, gemm_args_t)) { + using execution_space = typename DeviceType::execution_space; + // Just create "expected" types using non-simd types. + decltype(gemm_args.C) C_expected; + decltype(gemm_args.A) A_expected; + decltype(gemm_args.B) B_expected; + STATUS; + + if (options.blas_args.batch_size_last_dim) { + C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.m, gemm_args.dims.c.n, gemm_args.dims.c.k); + A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.m, gemm_args.dims.a.n, gemm_args.dims.a.k); + B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.m, gemm_args.dims.b.n, gemm_args.dims.b.k); + } else { + C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.k, gemm_args.dims.c.m, gemm_args.dims.c.n); + A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.k, gemm_args.dims.a.m, gemm_args.dims.a.n); + B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.k, gemm_args.dims.b.m, gemm_args.dims.b.n); + } + + // Initialize "expected" matrices. 
+ if (gemm_args.C.data() != nullptr) { + Kokkos::deep_copy(C_expected, gemm_args.C); + Kokkos::deep_copy(A_expected, gemm_args.A); + Kokkos::deep_copy(B_expected, gemm_args.B); + + Kokkos::fence(); // Ensure that deep_copy has completed + + // Check that initial values match + if (__gemm_do_compare(C_expected, gemm_args.C)) + FATAL_ERROR("Inital values mismatch!"); + } else if (gemm_args.Cv.vec_3d.data() != nullptr) { + // TODO: Debug this when batch_size % simd_vector_len != 0. + memcpy(C_expected.data(), gemm_args.Cv.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.c.k * gemm_args.dims.c.m * gemm_args.dims.c.n); + memcpy(A_expected.data(), gemm_args.Av.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.a.k * gemm_args.dims.a.m * gemm_args.dims.a.n); + memcpy(B_expected.data(), gemm_args.Bv.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.b.k * gemm_args.dims.b.m * gemm_args.dims.b.n); + + // Check that initial values match + if (__gemm_do_compare(C_expected, gemm_args.Cv)) + FATAL_ERROR("Inital values mismatch!"); + } else { + FATAL_ERROR("Input arguments are empty!"); + } + + // Populate "expected" matrices via VanillaGemm + Test::Functor_BatchedVanillaGEMM vgemm; + vgemm.A_t = toupper(gemm_args.transA) == 'T'; + vgemm.B_t = toupper(gemm_args.transB) == 'T'; + vgemm.A_c = vgemm.B_c = false; + vgemm.A = A_expected; + vgemm.B = B_expected; + vgemm.C = C_expected; + vgemm.alpha = gemm_args.alpha; + vgemm.beta = gemm_args.beta; + vgemm.run(); // Compute C_expected + + // Run routine with warm_up_n = 1 and n = 0. + auto warm_up_n_bak = options.warm_up_n; + options.warm_up_n = 1; + auto n_bak = options.n; + options.n = 0; + fn(options, gemm_args); + + Kokkos::fence(); // Redundant fence. 
+ + // Check the result + if (gemm_args.C.data() != nullptr) { + if (__gemm_do_compare(C_expected, gemm_args.C)) + FATAL_ERROR("Result value mismatch!"); + } + + if (gemm_args.Cv.vec_3d.data() != nullptr) { + if (__gemm_do_compare(C_expected, gemm_args.Cv)) + FATAL_ERROR("Result value mismatch!"); + } + + // Run actual timed test. + options.verify = false; // Set verify to false for csv output. + options.warm_up_n = warm_up_n_bak; + options.n = n_bak; + fn(options, gemm_args); + + // Reset verify for next matrix size. + options.verify = true; +} + /*************************** Internal setup fns **************************/ template gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { @@ -1457,6 +1610,8 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.bp.team_size = options.blas_args.team_size; gemm_args.bp.vector_len = options.blas_args.vector_len; + Kokkos::fence(); // Ensure that fill_random has completed. + return gemm_args; } @@ -1484,7 +1639,12 @@ void __do_loop_and_invoke(options_t options, cur_dims.c.m += options.step, cur_dims.c.n += options.step) { gemm_args = __do_setup(options, cur_dims); - fn(options, gemm_args); + + if (options.verify) { + __gemm_do_verify(options, gemm_args, fn); + } else { + fn(options, gemm_args); + } } return; } diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index daf68180c2..73f5a18452 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -68,6 +68,7 @@ static struct option long_options[] = { {"vector_len", required_argument, 0, 'n'}, {"batch_size", required_argument, 0, 'k'}, {"batch_size_last_dim", required_argument, 0, 'd'}, + {"verify", required_argument, 0, 'v'}, {0, 0, 0, 0}}; static void __print_help_blas3_perf_test() { @@ -122,23 +123,23 @@ static void __print_help_blas3_perf_test() { printf("\t\t\tThe value of LEN as an integer. 
(default: %d)\n", DEFAULT_VECTOR_LEN); - printf("\t-u, --use_auto={0,1}\n"); + printf("\t-u, --use_auto=AUTO\n"); printf( "\t\tWhether to use Kokkos::AUTO for vector_len and team_size " "(Heirarchical parallelism).\n"); printf( - "\t\t\t1 to use Kokkos::AUTO, otherwise --vector_len and --team_size " - "will be used. (default: %d)\n", + "\t\t\tValid values for AUTO are 1 to use Kokkos::AUTO and 0 to use --vector_len and --team_size " + "instead. (default: %d)\n", DEFAULT_USE_AUTO); printf("\t-k, --batch_size=LEN\n"); printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n"); printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", DEFAULT_K); - printf("\t-d, --batch_size_last_dim={0,1}\n"); + printf("\t-d, --batch_size_last_dim=LAST_DIM\n"); printf("\t\tHow to allocate the batch_size in the matrices.\n"); printf( - "\t\t\t1 make the batch_size the last dimension, otherwise batch_size is " + "\t\t\tValid values for LAST_DIM are 1 make the batch_size the last dimension and 0 to make the batch_size " "the first dimension (default: %d)\n", DEFAULT_BATCH_SIZE_LAST_DIM); @@ -207,6 +208,13 @@ static void __print_help_blas3_perf_test() { "\t\t\tValid value for ROUTINES is one of more valid blas3 routines " "delimited by a comma. (default: %s)\n", DEFAULT_BLAS_ROUTINES); + + printf("\t-v, --verify=VERIFY\n"); + printf("\t\tVerification selection. (untimed)\n"); + printf( + "\t\t\tValid values for VERIFY are either 0 to skip verification or 1 to verify before timing. 
" + "(default: %d)\n", + DEFAULT_VERIFY); } static void __blas3_perf_test_input_error(char **argv, char short_opt, @@ -258,6 +266,7 @@ int main(int argc, char **argv) { options.blas_args.vector_len = DEFAULT_VECTOR_LEN; options.blas_args.use_auto = DEFAULT_USE_AUTO; options.blas_args.batch_size_last_dim = DEFAULT_BATCH_SIZE_LAST_DIM; + options.verify = DEFAULT_VERIFY; options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS; options.blas_args.trmm.alpha = DEFAULT_TRMM_ALPHA; diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 20a568bbc1..8ad7fe22af 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -202,5 +202,20 @@ namespace Test { *this); } }; + + template + class epsilon { + public: + constexpr static double value = std::numeric_limits::epsilon(); + }; + + // explicit epsilon specializations + #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT + template + class epsilon { + public: + constexpr static double value = 0009765625F; + }; + #endif // KOKKOS_HALF_T_IS_FLOAT } #endif From 0de685f74269557950aae2271c74d52d26d5c94f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 5 Mar 2021 12:49:54 -0700 Subject: [PATCH 086/126] test_common: Fix half_t epsilon specialization --- test_common/KokkosKernels_TestUtils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 8ad7fe22af..64b3902ec7 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -211,8 +211,8 @@ namespace Test { // explicit epsilon specializations #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT - template - class epsilon { + template<> + class epsilon { public: constexpr static double value = 0009765625F; }; From 29322e8cac1f82484ba3c01120e16d9c706035d3 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 5 Mar 2021 14:45:00 
-0700 Subject: [PATCH 087/126] perf_test/blas/blas3: Use TeamPolicy for serial simd --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index df08e30aaa..77d5850fab 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1706,28 +1706,32 @@ void do_gemm_serial_batched_blocked_parallel(options_t options) { void do_gemm_serial_simd_batched_parallel(options_t options) { STATUS; + // SerialBatchDim3Tag + // SerialSimdTag if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, - __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); return; } void do_gemm_serial_simd_batched_blocked_parallel(options_t options) { STATUS; + // SerialBatchDim3Tag + // SerialSimdTag if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, - __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); return; } From 80ca02ebc6efc905bccf7036a419f1fce6ee414e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 9 Mar 2021 11:49:49 -0700 Subject: [PATCH 088/126] perf_test/blas/blas3: Process verify option --- perf_test/blas/blas3/KokkosBlas3_perf_test.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 73f5a18452..7e1cdf0f2f 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -275,7 +275,7 @@ int main(int argc, char **argv) { options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; options.blas_args.gemm.beta = DEFAULT_GEMM_BETA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:", + while ((ret = getopt_long(argc, argv, 
"ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:v:", long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; @@ -397,6 +397,7 @@ int main(int argc, char **argv) { atoi(optarg); break; case 'd': options.blas_args.batch_size_last_dim = atoi(optarg); break; + case 'v': options.verify = atoi(optarg); break; case 'z': options.blas_args.team_size = atoi(optarg); break; case 'n': options.blas_args.vector_len = atoi(optarg); break; case 'u': options.blas_args.use_auto = atoi(optarg); break; From 239d44de0fbee7a17a51d3d8c28cc7f1497ea0e0 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 9 Mar 2021 12:02:24 -0700 Subject: [PATCH 089/126] perf_test/blas/blas3: Relax epsilon --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 77d5850fab..114cc49422 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1369,7 +1369,7 @@ static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type */ template static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) { - double epsilon = Test::epsilon::value; + double epsilon = Test::epsilon::value * 1e3; STATUS; for (size_t i = 0; i < expected.extent(0); i++) { From 55e3eb30670202eb2eab54d261799fe2c3c5c84e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 9 Mar 2021 12:05:25 -0700 Subject: [PATCH 090/126] perf_test/blas/blas3: Add TODO for bug --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 114cc49422..d38bfccd60 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ 
-312,6 +312,7 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) { auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); + // TODO: Debug this when starting a matrix sizes <= 10x10 KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha, A, B, _gemm_args.beta, C); } From 53aa6536ca7643a111383d4f7ae1fe4d65af5857 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 09:03:41 -0700 Subject: [PATCH 091/126] perf_test/blas/blas3: Fix verify for simd when batch_size is first dim --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 41 +++++++++++++++++-- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index d38bfccd60..ad01d9acad 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -131,6 +131,8 @@ using view_type_3d = Kokkos::View; using view_type_4d = Kokkos::View; +using view_type_5d = + Kokkos::View; // Construct the vector type using memory_space = typename default_device::execution_space::memory_space; @@ -1402,6 +1404,38 @@ static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t act return __gemm_do_compare(expected, actual_data); } +template +static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) { + using scalar_type = typename dstViewType::value_type; + view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); + + if (options.blas_args.batch_size_last_dim) { + exit(255); // TODO + } else { + size_t remainder = dst.extent(0) % simd_vector_size; + if (remainder > 0) { + // The below loops map a given 2-rank gemm within the simd view back to the + // 
3-rank view. + for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { + auto sv0 = Kokkos::subview(src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(0); simd_batch_size_idx++) { + auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) { + auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), vector_batch_idx); + for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { + for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { + dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = sv2(m, n); + } + } + } + } + } + } else { + memcpy(dst.data(), src.ivec_4d.data(), sizeof(scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2)); + } + } +} + template static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, void (*fn)(options_t, gemm_args_t)) { using execution_space = typename DeviceType::execution_space; @@ -1433,10 +1467,9 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo if (__gemm_do_compare(C_expected, gemm_args.C)) FATAL_ERROR("Inital values mismatch!"); } else if (gemm_args.Cv.vec_3d.data() != nullptr) { - // TODO: Debug this when batch_size % simd_vector_len != 0. 
- memcpy(C_expected.data(), gemm_args.Cv.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.c.k * gemm_args.dims.c.m * gemm_args.dims.c.n); - memcpy(A_expected.data(), gemm_args.Av.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.a.k * gemm_args.dims.a.m * gemm_args.dims.a.n); - memcpy(B_expected.data(), gemm_args.Bv.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.b.k * gemm_args.dims.b.m * gemm_args.dims.b.n); + __gemm_copy_simd_view_to_3d_view(gemm_args.Cv, C_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Av, A_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Bv, B_expected, options); // Check that initial values match if (__gemm_do_compare(C_expected, gemm_args.Cv)) From 192fde6a76d975ed5c324f97bb46ca9e0545b24e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 10:49:00 -0700 Subject: [PATCH 092/126] perf_test/blas/blas3: Complete verify for batch_size in first dimension --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 67 ++++++++++++------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index ad01d9acad..a1e870e4c0 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1375,33 +1375,29 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) double epsilon = Test::epsilon::value * 1e3; STATUS; - for (size_t i = 0; i < expected.extent(0); i++) { - for (size_t j = 0; j < expected.extent(1); j++) { - for (size_t k = 0; k < expected.extent(2); k++) { - if (std::is_same::value) { - return __gemm_print_compare_failure(expected, actual, i, j, k, epsilon); + if (std::is_same::value) { + for (size_t i = 0; i < expected.extent(0); i++) { + for (size_t j = 0; j < expected.extent(1); j++) { + for (size_t k = 0; k < expected.extent(2); k++) { + if (__gemm_print_compare_failure(expected, 
actual, i, j, k, epsilon)) + return true; } - if (std::is_same::value) { - return __gemm_print_compare_failure(expected, actual, k, j, i, epsilon); + } + } + } + + if (std::is_same::value) { + for (size_t k = 0; k < expected.extent(2); k++) { + for (size_t j = 0; j < expected.extent(1); j++) { + for (size_t i = 0; i < expected.extent(0); i++) { + if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon)) + return true; } } } } - return false; -} -/** - * Compare all values of expected with all values of actual. - * @var expected: the expected results - * @var actual: the actual results - * @return false if expected matches actual within epsilon, otherwise true. - */ -template -static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t actual) { - std::cout << actual.mat_4d.extent(0) << "x" << actual.mat_4d.extent(1) << "x" << actual.mat_4d.extent(2) << "x" << actual.mat_4d.extent(3) << std::endl; - decltype(expected) actual_data(actual.mat_4d.data(), expected.extent(0), expected.extent(1), expected.extent(2)); - STATUS; - return __gemm_do_compare(expected, actual_data); + return false; } template @@ -1414,7 +1410,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie } else { size_t remainder = dst.extent(0) % simd_vector_size; if (remainder > 0) { - // The below loops map a given 2-rank gemm within the simd view back to the + // The below loops copies each corresponding 2-rank matrix within the simd view back to the // 3-rank view. 
for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { auto sv0 = Kokkos::subview(src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); @@ -1431,11 +1427,34 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie } } } else { + // When the batch_size is a multiple of the simd_vector_size, each 2-rank matrix lies in the correct location + // and the data can simply be copied. memcpy(dst.data(), src.ivec_4d.data(), sizeof(scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2)); } } } +/** + * Compare all values of expected with all values of actual. + * @var expected: the expected results + * @var actual: the actual results + * @return false if expected matches actual within epsilon, otherwise true. + */ +template +static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t actual, options_t options) { + decltype(expected) actual_data("actual_data", expected.extent(0), expected.extent(1), expected.extent(2)); + + STATUS; + + // Copy the simd view to a 3d view for comparision. + // NOTE: The raw results are different when batch_size % simd_vector_size != 0. + // Also note that when batch_size % simd_vector_size != 0, the simd operation + // calculates results that we do not require. + // So, we end up running an extra batch_size % simd_vector_size GEMMs! 
+ __gemm_copy_simd_view_to_3d_view(actual, actual_data, options); + return __gemm_do_compare(expected, actual_data); +} + template static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, void (*fn)(options_t, gemm_args_t)) { using execution_space = typename DeviceType::execution_space; @@ -1472,7 +1491,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo __gemm_copy_simd_view_to_3d_view(gemm_args.Bv, B_expected, options); // Check that initial values match - if (__gemm_do_compare(C_expected, gemm_args.Cv)) + if (__gemm_do_compare(C_expected, gemm_args.Cv, options)) FATAL_ERROR("Inital values mismatch!"); } else { FATAL_ERROR("Input arguments are empty!"); @@ -1506,7 +1525,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo } if (gemm_args.Cv.vec_3d.data() != nullptr) { - if (__gemm_do_compare(C_expected, gemm_args.Cv)) + if (__gemm_do_compare(C_expected, gemm_args.Cv, options)) FATAL_ERROR("Result value mismatch!"); } From e4351716f2cf7fc4daebcfb60933e488b50b1d1e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 13:01:35 -0700 Subject: [PATCH 093/126] test_common: Update VanillaGEMM with batch_size_last_dim member --- test_common/KokkosKernels_TestUtils.hpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 64b3902ec7..1d383ffd35 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -159,7 +159,7 @@ namespace Test { // C(i,:,:) = alpha * (A(i,:,:) * B(i,:,:)) + beta * C(i,:,:) template struct Functor_BatchedVanillaGEMM { - bool A_t, B_t, A_c, B_c; + bool A_t, B_t, A_c, B_c, batch_size_last_dim = false; ViewTypeA A; ViewTypeB B; ViewTypeC C; @@ -177,15 +177,20 @@ namespace Test { auto _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); auto _B = Kokkos::subview(B, i, Kokkos::ALL(), 
Kokkos::ALL()); auto _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + if (batch_size_last_dim) { + _A = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), i); + _B = Kokkos::subview(B, Kokkos::ALL(), Kokkos::ALL(), i); + _C = Kokkos::subview(C, Kokkos::ALL(), Kokkos::ALL(), i); + } using SubviewTypeA = decltype(_A); using SubviewTypeB = decltype(_B); using SubviewTypeC = decltype(_C); struct SharedVanillaGEMM vgemm; vgemm.A_t = A_t; vgemm.B_t = B_t; vgemm.A_c = A_c; vgemm.B_c = B_c; - vgemm.C_rows = C.extent(1); - vgemm.C_cols = C.extent(2); - vgemm.A_cols = A_t?A.extent(1):A.extent(2); + vgemm.C_rows = batch_size_last_dim ? C.extent(0) : C.extent(1); + vgemm.C_cols = batch_size_last_dim ? C.extent(1) : C.extent(2); + vgemm.A_cols = batch_size_last_dim ? (A_t?A.extent(0):A.extent(1)) : (A_t?A.extent(1):A.extent(2)); vgemm.A = _A; vgemm.B = _B; vgemm.C = _C; @@ -198,7 +203,7 @@ namespace Test { void run() { Kokkos::parallel_for( "Test::VanillaGEMM", - Kokkos::TeamPolicy(C.extent(0), Kokkos::AUTO, 16), + Kokkos::TeamPolicy(batch_size_last_dim ? 
C.extent(2) : C.extent(0), Kokkos::AUTO, 16), *this); } }; From 07906733755fda24433b974df27e97cc9bf080ca Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 13:01:47 -0700 Subject: [PATCH 094/126] perf_test/blas/blas3: Add batch_size_last_dim to vgemm --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index a1e870e4c0..8fca4e76b2 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1502,6 +1502,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo vgemm.A_t = toupper(gemm_args.transA) == 'T'; vgemm.B_t = toupper(gemm_args.transB) == 'T'; vgemm.A_c = vgemm.B_c = false; + vgemm.batch_size_last_dim = options.blas_args.batch_size_last_dim; vgemm.A = A_expected; vgemm.B = B_expected; vgemm.C = C_expected; From 137adccbbee971bfcbf073f6374cdad90a719874 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 13:45:38 -0700 Subject: [PATCH 095/126] perf_test/blas/blas3: Update compare routines - Handle simd with batch_size in last dimension - Work with device views --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 8fca4e76b2..f560690e54 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1350,12 +1350,14 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { */ static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type_3d actual, int i, int j, int k, double epsilon) { STATUS; - auto diff = static_cast(Kokkos::Experimental::fabs(expected(i,j,k) - actual(i,j,k))); + 
typename view_type_3d::HostMirror h_expected = Kokkos::create_mirror_view(expected); + typename view_type_3d::HostMirror h_actual = Kokkos::create_mirror_view(actual); + auto diff = static_cast(Kokkos::Experimental::fabs(h_expected(i,j,k) - h_actual(i,j,k))); if (diff > epsilon) { printf("fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", - i,j,k,static_cast(expected(i,j,k)), - i,j,k,static_cast(actual(i,j,k)), + i,j,k,static_cast(h_expected(i,j,k)), + i,j,k,static_cast(h_actual(i,j,k)), diff, epsilon); FATAL_ERROR("Comparison failure!"); @@ -1403,17 +1405,39 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) template static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) { using scalar_type = typename dstViewType::value_type; - view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); if (options.blas_args.batch_size_last_dim) { - exit(255); // TODO + view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); + typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw); + size_t remainder = dst.extent(2) % simd_vector_size; + remainder = remainder == 0 ? simd_internal_vector_size : remainder; + + // The below loops copies each corresponding 2-rank matrix within the simd view back to the + // 3-rank view. 
+ for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { + auto sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(0); vector_batch_idx++) { + auto sv1 = Kokkos::subview(sv0, vector_batch_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(3); simd_batch_size_idx++) { + auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), simd_batch_size_idx); + for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { + for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { + dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = sv2(m, n); + } + } + } + } + } } else { + view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); + typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw); size_t remainder = dst.extent(0) % simd_vector_size; + if (remainder > 0) { // The below loops copies each corresponding 2-rank matrix within the simd view back to the // 3-rank view. 
for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { - auto sv0 = Kokkos::subview(src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(0); simd_batch_size_idx++) { auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) { From 891f4bd178b3c90a94d52eb10b05ccd8611d4454 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 13:54:56 -0700 Subject: [PATCH 096/126] test_common: Fix half_t epsilon --- test_common/KokkosKernels_TestUtils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 1d383ffd35..ad546fe0b4 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -219,7 +219,7 @@ namespace Test { template<> class epsilon { public: - constexpr static double value = 0009765625F; + constexpr static double value = 0.0009765625F; }; #endif // KOKKOS_HALF_T_IS_FLOAT } From a7558b5eaccfdbcb27de174e86ef1048a3d2f531 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 14:06:36 -0700 Subject: [PATCH 097/126] perf_test/blas/blas3: Update serial loops - Update serial loops for batch_size_last_dim option - Remove dead code --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 97 ++++++------------- 1 file changed, 30 insertions(+), 67 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index f560690e54..4ee8a676dd 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ 
b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -308,22 +308,28 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) { STATUS; - auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, bool batch_size_last_dim) { for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); + for (int j = 0; j < _gemm_args.dims.c.k; j++) { + auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, j, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, j, Kokkos::ALL(), Kokkos::ALL()); + if (batch_size_last_dim) { + A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j); + B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j); + C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); + } - // TODO: Debug this when starting a matrix sizes <= 10x10 - KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha, - A, B, _gemm_args.beta, C); + KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha, + A, B, _gemm_args.beta, C); + } } }; - __do_loop(options.warm_up_n, gemm_args); + __do_loop(options.warm_up_n, gemm_args, options.blas_args.batch_size_last_dim); Kokkos::fence(); timer.reset(); - __do_loop(options.n, gemm_args); + __do_loop(options.n, gemm_args, options.blas_args.batch_size_last_dim); Kokkos::fence(); __gemm_output_csv_row(options, gemm_args, timer.seconds()); @@ -341,22 +347,29 @@ void __do_gemm_serial_batched_template(options_t options, #if !defined(KOKKOS_ENABLE_CUDA) Kokkos::Timer timer; - auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, bool batch_size_last_dim) { for (uint32_t i 
= 0; i < n; ++i) { - auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); + for (int j = 0; j < _gemm_args.dims.c.k; j++) { + auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, j, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, j, Kokkos::ALL(), Kokkos::ALL()); + if (batch_size_last_dim) { + A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j); + B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j); + C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); + } - SerialGemm::invoke( - _gemm_args.alpha, A, B, _gemm_args.beta, C); + SerialGemm::invoke( + _gemm_args.alpha, A, B, _gemm_args.beta, C); + } } }; - __do_loop(options.warm_up_n, gemm_args); + __do_loop(options.warm_up_n, gemm_args, options.blas_args.batch_size_last_dim); Kokkos::fence(); timer.reset(); - __do_loop(options.n, gemm_args); + __do_loop(options.n, gemm_args, options.blas_args.batch_size_last_dim); Kokkos::fence(); __gemm_output_csv_row(options, gemm_args, timer.seconds()); #else @@ -400,56 +413,6 @@ void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) { return; } -#if !defined(KOKKOS_ENABLE_CUDA) -template -struct parallel_blas_gemm { - gemm_args_t gemm_args_; - - parallel_blas_gemm(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const int &i) const { - auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); - auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); - - KokkosBlas::gemm(&gemm_args_.transA, &gemm_args_.transB, gemm_args_.alpha, - svA, svB, gemm_args_.beta, svC); - } -}; -#endif // !KOKKOS_ENABLE_CUDA - -template 
-void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) { -#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) - uint32_t warm_up_n = options.warm_up_n; - uint32_t n = options.n; - Kokkos::Timer timer; - using execution_space = typename device_type::execution_space; - using functor_type = parallel_blas_gemm; - functor_type parallel_blas_gemm_functor(gemm_args); - - STATUS; - - Kokkos::parallel_for("parallelBlasWarmUpLoopGemm", - Kokkos::RangePolicy(0, warm_up_n), - parallel_blas_gemm_functor); - Kokkos::fence(); - - timer.reset(); - Kokkos::parallel_for("parallelBlasTimedLoopGemm", - Kokkos::RangePolicy(0, n), - parallel_blas_gemm_functor); - Kokkos::fence(); - __gemm_output_csv_row(options, gemm_args, timer.seconds()); -#else - std::cerr << std::string(__func__) - << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl; - __gemm_output_csv_row(options, gemm_args, -1); -#endif // !KOKKOS_ENABLE_CUDA - return; -} - template struct parallel_batched_gemm_range_policy { gemm_args_t gemm_args_; From 4ea0e4c863d3eab7de25c42ea6a90b05b19f4492 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 15:46:18 -0700 Subject: [PATCH 098/126] test_common: Update VanillaGemm - Fix VanillaGemm to work with batch_size_last_dim=true when Cuda is enabled. 
--- test_common/KokkosKernels_TestUtils.hpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index ad546fe0b4..43f2d48460 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -167,24 +167,29 @@ namespace Test { using ScalarA = typename ViewTypeA::value_type; using ScalarB = typename ViewTypeB::value_type; using ScalarC = typename ViewTypeC::value_type; + using SubviewTypeA = typename Kokkos::View; + using SubviewTypeB = typename Kokkos::View; + using SubviewTypeC = typename Kokkos::View; + ScalarA alpha; ScalarC beta; KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { int i = team.league_rank(); + SubviewTypeA _A; + SubviewTypeB _B; + SubviewTypeC _C; - auto _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); - auto _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); if (batch_size_last_dim) { _A = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), i); _B = Kokkos::subview(B, Kokkos::ALL(), Kokkos::ALL(), i); _C = Kokkos::subview(C, Kokkos::ALL(), Kokkos::ALL(), i); + } else { + _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); } - using SubviewTypeA = decltype(_A); - using SubviewTypeB = decltype(_B); - using SubviewTypeC = decltype(_C); struct SharedVanillaGEMM vgemm; vgemm.A_t = A_t; vgemm.B_t = B_t; vgemm.A_c = A_c; vgemm.B_c = B_c; From fc1996115f35d723a1cb6920e431ced87978d020 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 10 Mar 2021 10:05:07 -0700 Subject: [PATCH 099/126] Small but important fixes and testing coverage: - When doing MIS2 coarsening with sort+merge, actually return the sorted+merged graph instead of the raw coarsened one. 
- Replace std:cout with std::cout (in verbose path of D1/D2 coloring) - Set numClusters to 0 in MIS2 coarsening when graph has 0 vertices - Test MIS-2 based coarsening (fine graph -> labels) for zero row case - Test explicit coarsening (fine graph, labels -> coarse graph), was not tested before - Test with zero row case (this PR has a fix for that) - Test with and without compression (this PR also fixes that) - The test graph actually does exercise compression e.g. the uncompressed graph actually has duplicate entries per row. --- .../KokkosGraph_Distance1ColorHandle.hpp | 2 +- .../KokkosGraph_Distance2ColorHandle.hpp | 4 +- src/graph/KokkosGraph_ExplicitCoarsening.hpp | 4 ++ src/graph/KokkosGraph_MIS2.hpp | 1 + unit_test/graph/Test_Graph_mis2.hpp | 67 +++++++++++++++++++ 5 files changed, 75 insertions(+), 3 deletions(-) diff --git a/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/src/graph/KokkosGraph_Distance1ColorHandle.hpp index 077104ef9f..fc7f40bf1a 100644 --- a/src/graph/KokkosGraph_Distance1ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance1ColorHandle.hpp @@ -240,7 +240,7 @@ class GraphColoringHandle { this->coloring_algorithm_type = COLORING_SERIAL; #ifdef VERBOSE - std:cout << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n"; + std::cout << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n"; #endif } else if(KokkosKernels::Impl::kk_is_gpu_exec_space()) diff --git a/src/graph/KokkosGraph_Distance2ColorHandle.hpp b/src/graph/KokkosGraph_Distance2ColorHandle.hpp index 39d66b744f..4dc7dd7fe7 100644 --- a/src/graph/KokkosGraph_Distance2ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance2ColorHandle.hpp @@ -209,14 +209,14 @@ class GraphColorDistance2Handle { this->coloring_algorithm_type = COLORING_D2_SERIAL; #ifdef VERBOSE - std:cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL\n"; + std::cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL\n"; #endif } else { this->coloring_algorithm_type = 
COLORING_D2_NB_BIT; #ifdef VERBOSE - std:cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_D2_NB_BIT\n"; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_D2_NB_BIT\n"; #endif } } diff --git a/src/graph/KokkosGraph_ExplicitCoarsening.hpp b/src/graph/KokkosGraph_ExplicitCoarsening.hpp index 212cb7c383..def892a167 100644 --- a/src/graph/KokkosGraph_ExplicitCoarsening.hpp +++ b/src/graph/KokkosGraph_ExplicitCoarsening.hpp @@ -80,6 +80,8 @@ void graph_explicit_coarsen( coarse_entries_t mergedEntries; KokkosKernels::Impl::sort_and_merge_graph (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + coarseRowmap = mergedRowmap; + coarseEntries = mergedEntries; } } @@ -109,6 +111,8 @@ void graph_explicit_coarsen_with_inverse_map( coarse_entries_t mergedEntries; KokkosKernels::Impl::sort_and_merge_graph (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + coarseRowmap = mergedRowmap; + coarseEntries = mergedEntries; } } diff --git a/src/graph/KokkosGraph_MIS2.hpp b/src/graph/KokkosGraph_MIS2.hpp index c578a97271..b3098870c5 100644 --- a/src/graph/KokkosGraph_MIS2.hpp +++ b/src/graph/KokkosGraph_MIS2.hpp @@ -94,6 +94,7 @@ graph_mis2_coarsen(const rowmap_t& rowmap, const colinds_t& colinds, typename co if(rowmap.extent(0) <= 1) { //there are no vertices to label + numClusters = 0; return labels_t(); } labels_t mis2 = graph_d2_mis(rowmap, colinds, algo); diff --git a/unit_test/graph/Test_Graph_mis2.hpp b/unit_test/graph/Test_Graph_mis2.hpp index 30d32fb2dc..4080a17f80 100644 --- a/unit_test/graph/Test_Graph_mis2.hpp +++ b/unit_test/graph/Test_Graph_mis2.hpp @@ -47,6 +47,7 @@ #include #include "KokkosGraph_MIS2.hpp" +#include "KokkosGraph_ExplicitCoarsening.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_IOUtils.hpp" #include "KokkosKernels_SparseUtils.hpp" @@ -194,9 +195,73 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t //Check that every 
label is in the range [0, numClusters) for(lno_t i = 0; i < numVerts; i++) EXPECT_TRUE(0 <= labelsHost(i) && labelsHost(i) < numClusters); + //Test explicit coarsening given the labels, with and without compressing the result + rowmap_t coarseRowmapNC, coarseRowmapC; + entries_t coarseEntriesNC, coarseEntriesC; + KokkosGraph::Experimental::graph_explicit_coarsen + (symRowmap, symEntries, labels, numClusters, coarseRowmapNC, coarseEntriesNC, false); + KokkosGraph::Experimental::graph_explicit_coarsen + (symRowmap, symEntries, labels, numClusters, coarseRowmapC, coarseEntriesC, true); + EXPECT_EQ(coarseRowmapC.extent(0), numClusters + 1); + EXPECT_EQ(coarseRowmapNC.extent(0), numClusters + 1); + //Check that coarse graph doesn't have more edges than fine graph + EXPECT_LE(coarseEntriesC.extent(0), symEntries.extent(0)); + EXPECT_LE(coarseEntriesNC.extent(0), symEntries.extent(0)); + //Verify compression is working. + auto hostRowmapNC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseRowmapNC); + auto hostEntriesNC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseEntriesNC); + auto hostRowmapC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseRowmapC); + auto hostEntriesC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseEntriesC); + for(lno_t i = 0; i < numClusters; i++) + { + //std::set maintains uniqueness as well as ascending order of elements. + //So it should exactly match the entries in the compressed version. 
+ std::set uniqueEntries; + for(size_type j = hostRowmapNC(i); j < hostRowmapNC(i + 1); j++) + { + uniqueEntries.insert(hostEntriesNC(j)); + } + size_type compressedRowLen = hostRowmapC(i + 1) - hostRowmapC(i); + ASSERT_EQ(uniqueEntries.size(), compressedRowLen); + auto it = uniqueEntries.begin(); + for(size_type j = hostRowmapC(i); j < hostRowmapC(i + 1); j++) + { + EXPECT_EQ(*it, hostEntriesC(j)); + it++; + } + } } } +template +void test_mis2_coarsening_zero_rows() +{ + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + rowmap_t fineRowmap; + entries_t fineEntries; + //note: MIS2 coarsening first calls MIS2 on the fine graph, so this covers the zero-row case for MIS2 alone. + lno_t numClusters; + auto labels = graph_mis2_coarsen(fineRowmap, fineEntries, numClusters, KokkosGraph::MIS2_FAST); + EXPECT_EQ(numClusters, 0); + EXPECT_EQ(labels.extent(0), 0); + //coarsen, should also produce a graph with 0 rows/entries + rowmap_t coarseRowmap; + entries_t coarseEntries; + KokkosGraph::Experimental::graph_explicit_coarsen + (fineRowmap, fineEntries, labels, 0, coarseRowmap, coarseEntries, false); + EXPECT_LE(coarseRowmap.extent(0), 1); + EXPECT_EQ(coarseEntries.extent(0), 0); + KokkosGraph::Experimental::graph_explicit_coarsen + (fineRowmap, fineEntries, labels, 0, coarseRowmap, coarseEntries, true); + EXPECT_LE(coarseRowmap.extent(0), 1); + EXPECT_EQ(coarseEntries.extent(0), 0); +} + #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, graph##_##graph_mis2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ { \ @@ -206,9 +271,11 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t } \ TEST_F(TestCategory, 
graph##_##graph_mis2_coarsening##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ { \ + test_mis2_coarsening(5000, 5000 * 200, 2000, 10); \ test_mis2_coarsening(5000, 5000 * 20, 1000, 10); \ test_mis2_coarsening(50, 50 * 10, 40, 10); \ test_mis2_coarsening(5, 5 * 3, 5, 0); \ + test_mis2_coarsening_zero_rows(); \ } #if defined(KOKKOSKERNELS_INST_DOUBLE) From 8522c914cbe58b14927903c690daa0904206df92 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 11 Mar 2021 11:55:01 -0700 Subject: [PATCH 100/126] perf_test/blas/blas3: Updates for half_t src/batched: Allow compile with half_t --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 58 +++++++++++++------ .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 24 +++++--- src/batched/KokkosBatched_Util.hpp | 3 +- 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 4ee8a676dd..ffb13819b6 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -266,8 +266,8 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, gflops = flops / 1e9; options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," - << options.blas_args.gemm.alpha << "," - << options.blas_args.gemm.beta << "," << ts << "," << vlen + << static_cast(options.blas_args.gemm.alpha) << "," + << static_cast(options.blas_args.gemm.beta) << "," << ts << "," << vlen << "," << loop_e_str[options.loop] << "," << __gemm_output_dim_string(options, gemm_args.dims.a) << "," << __gemm_output_dim_string(options, gemm_args.dims.b) << "," @@ -1315,7 +1315,7 @@ static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type STATUS; typename view_type_3d::HostMirror h_expected = Kokkos::create_mirror_view(expected); typename view_type_3d::HostMirror h_actual = Kokkos::create_mirror_view(actual); - auto diff = 
static_cast(Kokkos::Experimental::fabs(h_expected(i,j,k) - h_actual(i,j,k))); + auto diff = static_cast(Kokkos::Experimental::fabs(static_cast(h_expected(i,j,k) - h_actual(i,j,k)))); if (diff > epsilon) { printf("fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", @@ -1367,10 +1367,11 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) template static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) { - using scalar_type = typename dstViewType::value_type; + using dst_scalar_type = typename dstViewType::value_type; + using src_scalar_type = typename view_type_5d::value_type; if (options.blas_args.batch_size_last_dim) { - view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); + view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw); size_t remainder = dst.extent(2) % simd_vector_size; remainder = remainder == 0 ? 
simd_internal_vector_size : remainder; @@ -1392,7 +1393,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie } } } else { - view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); + view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw); size_t remainder = dst.extent(0) % simd_vector_size; @@ -1416,7 +1417,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie } else { // When the batch_size is a multiple of the simd_vector_size, each 2-rank matrix lies in the correct location // and the data can simply be copied. - memcpy(dst.data(), src.ivec_4d.data(), sizeof(scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2)); + memcpy(dst.data(), src.ivec_4d.data(), sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2)); } } } @@ -1616,15 +1617,24 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { // Use the non-simd 4-rank view type to randomly populate the gemm simd // arguments - Kokkos::fill_random(gemm_args.Av.mat_4d, rand_pool, + using tmp_view_type_4d = Kokkos::View; + tmp_view_type_4d tmpA("tmpA", gemm_args.Av.mat_4d.extent(0), gemm_args.Av.mat_4d.extent(1), gemm_args.Av.mat_4d.extent(2), gemm_args.Av.mat_4d.extent(3)); + Kokkos::fill_random(tmpA, rand_pool, Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.Bv.mat_4d, rand_pool, + double>::max()); + tmp_view_type_4d tmpB("tmpB", gemm_args.Bv.mat_4d.extent(0), gemm_args.Bv.mat_4d.extent(1), gemm_args.Bv.mat_4d.extent(2), gemm_args.Bv.mat_4d.extent(3)); + Kokkos::fill_random(tmpB, rand_pool, Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.Cv.mat_4d, 
rand_pool, + double>::max()); + tmp_view_type_4d tmpC("tmpC", gemm_args.Cv.mat_4d.extent(0), gemm_args.Cv.mat_4d.extent(1), gemm_args.Cv.mat_4d.extent(2), gemm_args.Cv.mat_4d.extent(3)); + Kokkos::fill_random(tmpC, rand_pool, Kokkos::rand, - scalar_type>::max()); + double>::max()); + Kokkos::fence(); + Kokkos::deep_copy(gemm_args.Av.mat_4d, tmpA); + Kokkos::deep_copy(gemm_args.Bv.mat_4d, tmpB); + Kokkos::deep_copy(gemm_args.Cv.mat_4d, tmpC); + Kokkos::fence(); } else { if (options.blas_args.batch_size_last_dim) { gemm_args.A = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k); @@ -1636,15 +1646,25 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); } - Kokkos::fill_random(gemm_args.A, rand_pool, + using tmp_view_type_3d = Kokkos::View; + tmp_view_type_3d tmpA("tmpA", gemm_args.A.extent(0), gemm_args.A.extent(1), gemm_args.A.extent(2)); + Kokkos::fill_random(tmpA, rand_pool, Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.B, rand_pool, + double>::max()); + tmp_view_type_3d tmpB("tmpB", gemm_args.B.extent(0), gemm_args.B.extent(1), gemm_args.B.extent(2)); + Kokkos::fill_random(tmpB, rand_pool, Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.C, rand_pool, + double>::max()); + tmp_view_type_3d tmpC("tmpC", gemm_args.C.extent(0), gemm_args.C.extent(1), gemm_args.C.extent(2)); + Kokkos::fill_random(tmpC, rand_pool, Kokkos::rand, - scalar_type>::max()); + double>::max()); + + Kokkos::fence(); + Kokkos::deep_copy(gemm_args.A, tmpA); + Kokkos::deep_copy(gemm_args.B, tmpB); + Kokkos::deep_copy(gemm_args.C, tmpC); + Kokkos::fence(); } gemm_args.alpha = options.blas_args.gemm.alpha; gemm_args.beta = options.blas_args.gemm.beta; diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 6d67e96bd1..0a6741c603 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ 
b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -175,7 +175,7 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, options.out[0] << test_e_str[options.test] << "," << options.blas_args.trmm.trmm_args << "," - << options.blas_args.trmm.alpha << "," + << static_cast(options.blas_args.trmm.alpha) << "," << loop_e_str[options.loop] << "," << trmm_args.A.extent(0) << "x" << trmm_args.A.extent(1) << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1) @@ -624,10 +624,14 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { trmm_args.alpha = options.blas_args.trmm.alpha; host_A = Kokkos::create_mirror_view(trmm_args.A); - Kokkos::fill_random(trmm_args.A, rand_pool, - Kokkos::rand, - scalar_type>::max()); - Kokkos::deep_copy(host_A, trmm_args.A); + + { + Kokkos::View tmp("tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), trmm_args.A.extent(2)); + Kokkos::fill_random(tmp, rand_pool, + Kokkos::rand, + double>::max()); + Kokkos::deep_copy(host_A, tmp); + } if (trmm_args.uplo == 'U' || trmm_args.uplo == 'u') { // Make A upper triangular @@ -663,9 +667,13 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { } Kokkos::deep_copy(trmm_args.A, host_A); - Kokkos::fill_random(trmm_args.B, rand_pool, - Kokkos::rand, - scalar_type>::max()); + { + Kokkos::View tmp("tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), trmm_args.B.extent(2)); + Kokkos::fill_random(tmp, rand_pool, + Kokkos::rand, + double>::max()); + Kokkos::deep_copy(trmm_args.B, tmp); + } return trmm_args; } diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 3253b6ce12..4a5c17d1df 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -204,7 +204,8 @@ namespace KokkosBatched { std::is_same >::value || std::is_same >::value || std::is_same >::value || - std::is_same >::value, + std::is_same >::value || + std::is_same::value, "KokkosKernels:: 
Invalid SIMD<> type." ); using value_type = T; }; From 4f9dafa854357505c1243cebea6afcec24405dff Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 11 Mar 2021 13:16:36 -0700 Subject: [PATCH 101/126] perf_test/blas: Apply clang-format --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 263 +++++++++++------- .../blas/blas3/KokkosBlas3_perf_test.cpp | 14 +- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 21 +- src/batched/KokkosBatched_Util.hpp | 2 +- 4 files changed, 188 insertions(+), 112 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index ffb13819b6..081b01bb58 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -56,7 +56,7 @@ //#include "KokkosBatched_Gemm_Team_Impl.hpp" //#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" #include "KokkosBatched_Util.hpp" -#include "gtest/gtest.h" // EXPECT_NEAR +#include "gtest/gtest.h" // EXPECT_NEAR #include "KokkosKernels_TestUtils.hpp" //#define GEMM_PERF_TEST_DEBUG @@ -256,8 +256,7 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double gflops; double average_time = time_in_seconds / options.n; - if (options.verify) - return; + if (options.verify) return; flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, gemm_args.dims.a.n, @@ -267,8 +266,8 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," << static_cast(options.blas_args.gemm.alpha) << "," - << static_cast(options.blas_args.gemm.beta) << "," << ts << "," << vlen - << "," << loop_e_str[options.loop] << "," + << static_cast(options.blas_args.gemm.beta) << "," + << ts << "," << vlen << "," << loop_e_str[options.loop] << "," << __gemm_output_dim_string(options, gemm_args.dims.a) << "," << __gemm_output_dim_string(options, gemm_args.dims.b) << "," << 
__gemm_output_dim_string(options, gemm_args.dims.c) << "," @@ -308,7 +307,8 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) { STATUS; - auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, bool batch_size_last_dim) { + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, + bool batch_size_last_dim) { for (uint32_t i = 0; i < n; ++i) { for (int j = 0; j < _gemm_args.dims.c.k; j++) { auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL()); @@ -317,15 +317,16 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) { if (batch_size_last_dim) { A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j); B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j); - C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); + C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); } - KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha, - A, B, _gemm_args.beta, C); + KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, + _gemm_args.alpha, A, B, _gemm_args.beta, C); } } }; - __do_loop(options.warm_up_n, gemm_args, options.blas_args.batch_size_last_dim); + __do_loop(options.warm_up_n, gemm_args, + options.blas_args.batch_size_last_dim); Kokkos::fence(); timer.reset(); @@ -347,7 +348,8 @@ void __do_gemm_serial_batched_template(options_t options, #if !defined(KOKKOS_ENABLE_CUDA) Kokkos::Timer timer; - auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, bool batch_size_last_dim) { + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, + bool batch_size_last_dim) { for (uint32_t i = 0; i < n; ++i) { for (int j = 0; j < _gemm_args.dims.c.k; j++) { auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL()); @@ -356,7 +358,7 @@ void __do_gemm_serial_batched_template(options_t options, if (batch_size_last_dim) { A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j); B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), 
Kokkos::ALL(), j); - C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); + C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); } SerialGemm::invoke( @@ -365,7 +367,8 @@ void __do_gemm_serial_batched_template(options_t options, } }; - __do_loop(options.warm_up_n, gemm_args, options.blas_args.batch_size_last_dim); + __do_loop(options.warm_up_n, gemm_args, + options.blas_args.batch_size_last_dim); Kokkos::fence(); timer.reset(); @@ -1311,18 +1314,22 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { * @var epsilon: The tolerance to use when comparing. * @return true if the comparison fails and false if the comparison succeeds. */ -static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type_3d actual, int i, int j, int k, double epsilon) { +static inline bool __gemm_print_compare_failure(view_type_3d expected, + view_type_3d actual, int i, + int j, int k, double epsilon) { STATUS; - typename view_type_3d::HostMirror h_expected = Kokkos::create_mirror_view(expected); - typename view_type_3d::HostMirror h_actual = Kokkos::create_mirror_view(actual); - auto diff = static_cast(Kokkos::Experimental::fabs(static_cast(h_expected(i,j,k) - h_actual(i,j,k)))); + typename view_type_3d::HostMirror h_expected = + Kokkos::create_mirror_view(expected); + typename view_type_3d::HostMirror h_actual = + Kokkos::create_mirror_view(actual); + auto diff = static_cast(Kokkos::Experimental::fabs( + static_cast(h_expected(i, j, k) - h_actual(i, j, k)))); if (diff > epsilon) { - printf("fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", - i,j,k,static_cast(h_expected(i,j,k)), - i,j,k,static_cast(h_actual(i,j,k)), - diff, - epsilon); + printf( + "fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", + i, j, k, static_cast(h_expected(i, j, k)), i, j, k, + static_cast(h_actual(i, j, k)), diff, epsilon); FATAL_ERROR("Comparison failure!"); return true; } @@ -1336,7 +1343,8 @@ 
static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type * @return false if expected matches actual within epsilon, otherwise true. */ template -static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) { +static inline bool __gemm_do_compare(view_type_3d expected, + view_type_3d actual) { double epsilon = Test::epsilon::value * 1e3; STATUS; @@ -1354,7 +1362,7 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) if (std::is_same::value) { for (size_t k = 0; k < expected.extent(2); k++) { for (size_t j = 0; j < expected.extent(1); j++) { - for (size_t i = 0; i < expected.extent(0); i++) { + for (size_t i = 0; i < expected.extent(0); i++) { if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon)) return true; } @@ -1366,58 +1374,90 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) } template -static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) { +static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, + dstViewType dst, + options_t options) { using dst_scalar_type = typename dstViewType::value_type; using src_scalar_type = typename view_type_5d::value_type; if (options.blas_args.batch_size_last_dim) { - view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); - typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw); + view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), + simd_internal_vector_size, src.ivec_4d.extent(0), + src.ivec_4d.extent(1), src.ivec_4d.extent(2), + src.ivec_4d.extent(3)); + typename view_type_5d::HostMirror h_src_raw = + Kokkos::create_mirror_view(src_raw); size_t remainder = dst.extent(2) % simd_vector_size; - remainder = remainder == 0 ? 
simd_internal_vector_size : remainder; - - // The below loops copies each corresponding 2-rank matrix within the simd view back to the - // 3-rank view. - for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { - auto sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(0); vector_batch_idx++) { - auto sv1 = Kokkos::subview(sv0, vector_batch_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(3); simd_batch_size_idx++) { - auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), simd_batch_size_idx); + remainder = remainder == 0 ? simd_internal_vector_size : remainder; + + // The below loops copies each corresponding 2-rank matrix within the simd + // view back to the 3-rank view. + for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; + simd_internal_vec_idx++) { + auto sv0 = + Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + for (size_t vector_batch_idx = 0; + vector_batch_idx < src.ivec_4d.extent(0); vector_batch_idx++) { + auto sv1 = Kokkos::subview(sv0, vector_batch_idx, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + for (size_t simd_batch_size_idx = 0; + simd_batch_size_idx < src.ivec_4d.extent(3); + simd_batch_size_idx++) { + auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), + simd_batch_size_idx); for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { - dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = sv2(m, n); + dst(m, n, + simd_internal_vec_idx + simd_batch_size_idx + + vector_batch_idx) = sv2(m, n); } } } } } } else { - view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), simd_internal_vector_size, 
src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); - typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw); + view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), + simd_internal_vector_size, src.ivec_4d.extent(0), + src.ivec_4d.extent(1), src.ivec_4d.extent(2), + src.ivec_4d.extent(3)); + typename view_type_5d::HostMirror h_src_raw = + Kokkos::create_mirror_view(src_raw); size_t remainder = dst.extent(0) % simd_vector_size; if (remainder > 0) { - // The below loops copies each corresponding 2-rank matrix within the simd view back to the - // 3-rank view. - for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { - auto sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(0); simd_batch_size_idx++) { - auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) { - auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), vector_batch_idx); + // The below loops copies each corresponding 2-rank matrix within the simd + // view back to the 3-rank view. 
+ for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; + simd_internal_vec_idx++) { + auto sv0 = + Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + for (size_t simd_batch_size_idx = 0; + simd_batch_size_idx < src.ivec_4d.extent(0); + simd_batch_size_idx++) { + auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + for (size_t vector_batch_idx = 0; + vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) { + auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), + vector_batch_idx); for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { - dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = sv2(m, n); + dst(simd_internal_vec_idx + simd_batch_size_idx + + vector_batch_idx, + m, n) = sv2(m, n); } } } } } } else { - // When the batch_size is a multiple of the simd_vector_size, each 2-rank matrix lies in the correct location - // and the data can simply be copied. - memcpy(dst.data(), src.ivec_4d.data(), sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2)); + // When the batch_size is a multiple of the simd_vector_size, each 2-rank + // matrix lies in the correct location and the data can simply be copied. + memcpy(dst.data(), src.ivec_4d.data(), + sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * + dst.extent(2)); } } } @@ -1429,22 +1469,26 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie * @return false if expected matches actual within epsilon, otherwise true. 
*/ template -static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t actual, options_t options) { - decltype(expected) actual_data("actual_data", expected.extent(0), expected.extent(1), expected.extent(2)); +static inline bool __gemm_do_compare(view_type_3d expected, + gemm_simd_args_t actual, + options_t options) { + decltype(expected) actual_data("actual_data", expected.extent(0), + expected.extent(1), expected.extent(2)); STATUS; // Copy the simd view to a 3d view for comparision. - // NOTE: The raw results are different when batch_size % simd_vector_size != 0. - // Also note that when batch_size % simd_vector_size != 0, the simd operation - // calculates results that we do not require. - // So, we end up running an extra batch_size % simd_vector_size GEMMs! + // NOTE: The raw results are different when batch_size % simd_vector_size != + // 0. Also note that when batch_size % simd_vector_size != 0, the simd + // operation calculates results that we do not require. So, we end up running + // an extra batch_size % simd_vector_size GEMMs! __gemm_copy_simd_view_to_3d_view(actual, actual_data, options); return __gemm_do_compare(expected, actual_data); } template -static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, void (*fn)(options_t, gemm_args_t)) { +static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, + void (*fn)(options_t, gemm_args_t)) { using execution_space = typename DeviceType::execution_space; // Just create "expected" types using non-simd types. 
decltype(gemm_args.C) C_expected; @@ -1453,13 +1497,19 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo STATUS; if (options.blas_args.batch_size_last_dim) { - C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.m, gemm_args.dims.c.n, gemm_args.dims.c.k); - A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.m, gemm_args.dims.a.n, gemm_args.dims.a.k); - B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.m, gemm_args.dims.b.n, gemm_args.dims.b.k); + C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.m, + gemm_args.dims.c.n, gemm_args.dims.c.k); + A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.m, + gemm_args.dims.a.n, gemm_args.dims.a.k); + B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.m, + gemm_args.dims.b.n, gemm_args.dims.b.k); } else { - C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.k, gemm_args.dims.c.m, gemm_args.dims.c.n); - A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.k, gemm_args.dims.a.m, gemm_args.dims.a.n); - B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.k, gemm_args.dims.b.m, gemm_args.dims.b.n); + C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.k, + gemm_args.dims.c.m, gemm_args.dims.c.n); + A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.k, + gemm_args.dims.a.m, gemm_args.dims.a.n); + B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.k, + gemm_args.dims.b.m, gemm_args.dims.b.n); } // Initialize "expected" matrices. 
@@ -1468,44 +1518,50 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo Kokkos::deep_copy(A_expected, gemm_args.A); Kokkos::deep_copy(B_expected, gemm_args.B); - Kokkos::fence(); // Ensure that deep_copy has completed + Kokkos::fence(); // Ensure that deep_copy has completed // Check that initial values match if (__gemm_do_compare(C_expected, gemm_args.C)) FATAL_ERROR("Inital values mismatch!"); } else if (gemm_args.Cv.vec_3d.data() != nullptr) { - __gemm_copy_simd_view_to_3d_view(gemm_args.Cv, C_expected, options); - __gemm_copy_simd_view_to_3d_view(gemm_args.Av, A_expected, options); - __gemm_copy_simd_view_to_3d_view(gemm_args.Bv, B_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Cv, + C_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Av, + A_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Bv, + B_expected, options); // Check that initial values match - if (__gemm_do_compare(C_expected, gemm_args.Cv, options)) + if (__gemm_do_compare(C_expected, gemm_args.Cv, + options)) FATAL_ERROR("Inital values mismatch!"); } else { FATAL_ERROR("Input arguments are empty!"); } // Populate "expected" matrices via VanillaGemm - Test::Functor_BatchedVanillaGEMM vgemm; + Test::Functor_BatchedVanillaGEMM + vgemm; vgemm.A_t = toupper(gemm_args.transA) == 'T'; vgemm.B_t = toupper(gemm_args.transB) == 'T'; - vgemm.A_c = vgemm.B_c = false; + vgemm.A_c = vgemm.B_c = false; vgemm.batch_size_last_dim = options.blas_args.batch_size_last_dim; - vgemm.A = A_expected; - vgemm.B = B_expected; - vgemm.C = C_expected; - vgemm.alpha = gemm_args.alpha; - vgemm.beta = gemm_args.beta; - vgemm.run(); // Compute C_expected - - // Run routine with warm_up_n = 1 and n = 0. + vgemm.A = A_expected; + vgemm.B = B_expected; + vgemm.C = C_expected; + vgemm.alpha = gemm_args.alpha; + vgemm.beta = gemm_args.beta; + vgemm.run(); // Compute C_expected + + // Run routine with warm_up_n = 1 and n = 0. 
auto warm_up_n_bak = options.warm_up_n; - options.warm_up_n = 1; - auto n_bak = options.n; - options.n = 0; + options.warm_up_n = 1; + auto n_bak = options.n; + options.n = 0; fn(options, gemm_args); - Kokkos::fence(); // Redundant fence. + Kokkos::fence(); // Redundant fence. // Check the result if (gemm_args.C.data() != nullptr) { @@ -1514,14 +1570,15 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo } if (gemm_args.Cv.vec_3d.data() != nullptr) { - if (__gemm_do_compare(C_expected, gemm_args.Cv, options)) + if (__gemm_do_compare(C_expected, gemm_args.Cv, + options)) FATAL_ERROR("Result value mismatch!"); } // Run actual timed test. - options.verify = false; // Set verify to false for csv output. + options.verify = false; // Set verify to false for csv output. options.warm_up_n = warm_up_n_bak; - options.n = n_bak; + options.n = n_bak; fn(options, gemm_args); // Reset verify for next matrix size. @@ -1617,16 +1674,23 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { // Use the non-simd 4-rank view type to randomly populate the gemm simd // arguments - using tmp_view_type_4d = Kokkos::View; - tmp_view_type_4d tmpA("tmpA", gemm_args.Av.mat_4d.extent(0), gemm_args.Av.mat_4d.extent(1), gemm_args.Av.mat_4d.extent(2), gemm_args.Av.mat_4d.extent(3)); + using tmp_view_type_4d = + Kokkos::View; + tmp_view_type_4d tmpA( + "tmpA", gemm_args.Av.mat_4d.extent(0), gemm_args.Av.mat_4d.extent(1), + gemm_args.Av.mat_4d.extent(2), gemm_args.Av.mat_4d.extent(3)); Kokkos::fill_random(tmpA, rand_pool, Kokkos::rand, double>::max()); - tmp_view_type_4d tmpB("tmpB", gemm_args.Bv.mat_4d.extent(0), gemm_args.Bv.mat_4d.extent(1), gemm_args.Bv.mat_4d.extent(2), gemm_args.Bv.mat_4d.extent(3)); + tmp_view_type_4d tmpB( + "tmpB", gemm_args.Bv.mat_4d.extent(0), gemm_args.Bv.mat_4d.extent(1), + gemm_args.Bv.mat_4d.extent(2), gemm_args.Bv.mat_4d.extent(3)); Kokkos::fill_random(tmpB, rand_pool, Kokkos::rand, double>::max()); - tmp_view_type_4d 
tmpC("tmpC", gemm_args.Cv.mat_4d.extent(0), gemm_args.Cv.mat_4d.extent(1), gemm_args.Cv.mat_4d.extent(2), gemm_args.Cv.mat_4d.extent(3)); + tmp_view_type_4d tmpC( + "tmpC", gemm_args.Cv.mat_4d.extent(0), gemm_args.Cv.mat_4d.extent(1), + gemm_args.Cv.mat_4d.extent(2), gemm_args.Cv.mat_4d.extent(3)); Kokkos::fill_random(tmpC, rand_pool, Kokkos::rand, double>::max()); @@ -1646,16 +1710,20 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); } - using tmp_view_type_3d = Kokkos::View; - tmp_view_type_3d tmpA("tmpA", gemm_args.A.extent(0), gemm_args.A.extent(1), gemm_args.A.extent(2)); + using tmp_view_type_3d = + Kokkos::View; + tmp_view_type_3d tmpA("tmpA", gemm_args.A.extent(0), gemm_args.A.extent(1), + gemm_args.A.extent(2)); Kokkos::fill_random(tmpA, rand_pool, Kokkos::rand, double>::max()); - tmp_view_type_3d tmpB("tmpB", gemm_args.B.extent(0), gemm_args.B.extent(1), gemm_args.B.extent(2)); + tmp_view_type_3d tmpB("tmpB", gemm_args.B.extent(0), gemm_args.B.extent(1), + gemm_args.B.extent(2)); Kokkos::fill_random(tmpB, rand_pool, Kokkos::rand, double>::max()); - tmp_view_type_3d tmpC("tmpC", gemm_args.C.extent(0), gemm_args.C.extent(1), gemm_args.C.extent(2)); + tmp_view_type_3d tmpC("tmpC", gemm_args.C.extent(0), gemm_args.C.extent(1), + gemm_args.C.extent(2)); Kokkos::fill_random(tmpC, rand_pool, Kokkos::rand, double>::max()); @@ -1671,7 +1739,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.bp.team_size = options.blas_args.team_size; gemm_args.bp.vector_len = options.blas_args.vector_len; - Kokkos::fence(); // Ensure that fill_random has completed. + Kokkos::fence(); // Ensure that fill_random has completed. 
return gemm_args; } @@ -1702,7 +1770,8 @@ void __do_loop_and_invoke(options_t options, view_type_3d, default_device>(options, cur_dims); if (options.verify) { - __gemm_do_verify(options, gemm_args, fn); + __gemm_do_verify( + options, gemm_args, fn); } else { fn(options, gemm_args); } diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 7e1cdf0f2f..149cc00fd1 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -128,7 +128,8 @@ static void __print_help_blas3_perf_test() { "\t\tWhether to use Kokkos::AUTO for vector_len and team_size " "(Heirarchical parallelism).\n"); printf( - "\t\t\tValid values for AUTO are 1 to use Kokkos::AUTO and 0 to use --vector_len and --team_size " + "\t\t\tValid values for AUTO are 1 to use Kokkos::AUTO and 0 to use " + "--vector_len and --team_size " "instead. (default: %d)\n", DEFAULT_USE_AUTO); @@ -139,7 +140,8 @@ static void __print_help_blas3_perf_test() { printf("\t-d, --batch_size_last_dim=LAST_DIM\n"); printf("\t\tHow to allocate the batch_size in the matrices.\n"); printf( - "\t\t\tValid values for LAST_DIM are 1 make the batch_size the last dimension and 0 to make the batch_size " + "\t\t\tValid values for LAST_DIM are 1 make the batch_size the last " + "dimension and 0 to make the batch_size " "the first dimension (default: %d)\n", DEFAULT_BATCH_SIZE_LAST_DIM); @@ -212,7 +214,8 @@ static void __print_help_blas3_perf_test() { printf("\t-v, --verify=VERIFY\n"); printf("\t\tVerification selection. (untimed)\n"); printf( - "\t\t\tValid values for VERIFY are either 0 to skip verification or 1 to verify before timing. " + "\t\t\tValid values for VERIFY are either 0 to skip verification or 1 to " + "verify before timing. 
" "(default: %d)\n", DEFAULT_VERIFY); } @@ -275,8 +278,9 @@ int main(int argc, char **argv) { options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; options.blas_args.gemm.beta = DEFAULT_GEMM_BETA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:v:", - long_options, &option_idx)) != -1) { + while ( + (ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:v:", + long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; case 't': diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 0a6741c603..de2bbd9ce9 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -106,8 +106,8 @@ static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, // Flop count formula from lapack working note 41: // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline double __trmm_flop_count(char side, double b_m, double b_n, double a_m, - double a_n) { +static inline double __trmm_flop_count(char side, double b_m, double b_n, + double a_m, double a_n) { double flops; if (side == 'L' || side == 'l') { @@ -624,12 +624,13 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { trmm_args.alpha = options.blas_args.trmm.alpha; host_A = Kokkos::create_mirror_view(trmm_args.A); - { - Kokkos::View tmp("tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), trmm_args.A.extent(2)); + Kokkos::View tmp( + "tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), + trmm_args.A.extent(2)); Kokkos::fill_random(tmp, rand_pool, - Kokkos::rand, - double>::max()); + Kokkos::rand, + double>::max()); Kokkos::deep_copy(host_A, tmp); } @@ -668,10 +669,12 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { Kokkos::deep_copy(trmm_args.A, host_A); { - Kokkos::View tmp("tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), trmm_args.B.extent(2)); + 
Kokkos::View tmp( + "tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), + trmm_args.B.extent(2)); Kokkos::fill_random(tmp, rand_pool, - Kokkos::rand, - double>::max()); + Kokkos::rand, + double>::max()); Kokkos::deep_copy(trmm_args.B, tmp); } diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 4a5c17d1df..eb9883c425 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -205,7 +205,7 @@ namespace KokkosBatched { std::is_same >::value || std::is_same >::value || std::is_same >::value || - std::is_same::value, + std::is_same::value, "KokkosKernels:: Invalid SIMD<> type." ); using value_type = T; }; From 3dd5c3bd8f2e3a406350c27dde8ecd4e5f847ec0 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 16 Mar 2021 16:45:11 -0600 Subject: [PATCH 102/126] perf_test/blas/blas3: Handle LayoutRight SIMD view - For GEMM, copy out LayoutRight SIMD sub-batches properly for verification. --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 155 +++++++++--------- 1 file changed, 80 insertions(+), 75 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 081b01bb58..cc68dcf43c 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -214,7 +214,7 @@ struct gemm_args { typedef struct gemm_args gemm_args_t; static std::string gemm_csv_header_str = - "algorithm,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_" + "algorithm,vector_type,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_" "dims,C_dims,warm_up_n," "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; @@ -249,6 +249,7 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, std::string algo_name = test_e_str[options.test]; std::string ts = std::to_string(gemm_args.bp.team_size); std::string vlen = std::to_string(gemm_args.bp.vector_len); 
+ std::string vtype = internal_vector_type::label(); if (experiment_name) algo_name = std::string(experiment_name); if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO"; @@ -264,7 +265,7 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, gflops = flops / 1e9; - options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," + options.out[0] << algo_name << "," << vtype << "," << options.blas_args.gemm.gemm_args << "," << static_cast(options.blas_args.gemm.alpha) << "," << static_cast(options.blas_args.gemm.beta) << "," << ts << "," << vlen << "," << loop_e_str[options.loop] << "," @@ -1379,85 +1380,89 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, options_t options) { using dst_scalar_type = typename dstViewType::value_type; using src_scalar_type = typename view_type_5d::value_type; + size_t remainder, vector_batch_size, simd_batch_size; + bool data_layout_same_as_3d_view = false; if (options.blas_args.batch_size_last_dim) { - view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), - simd_internal_vector_size, src.ivec_4d.extent(0), - src.ivec_4d.extent(1), src.ivec_4d.extent(2), - src.ivec_4d.extent(3)); - typename view_type_5d::HostMirror h_src_raw = - Kokkos::create_mirror_view(src_raw); - size_t remainder = dst.extent(2) % simd_vector_size; - remainder = remainder == 0 ? simd_internal_vector_size : remainder; - - // The below loops copies each corresponding 2-rank matrix within the simd - // view back to the 3-rank view. 
- for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; - simd_internal_vec_idx++) { - auto sv0 = - Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), - Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - for (size_t vector_batch_idx = 0; - vector_batch_idx < src.ivec_4d.extent(0); vector_batch_idx++) { - auto sv1 = Kokkos::subview(sv0, vector_batch_idx, Kokkos::ALL(), - Kokkos::ALL(), Kokkos::ALL()); - for (size_t simd_batch_size_idx = 0; - simd_batch_size_idx < src.ivec_4d.extent(3); - simd_batch_size_idx++) { - auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), - simd_batch_size_idx); - for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { - for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { - dst(m, n, - simd_internal_vec_idx + simd_batch_size_idx + - vector_batch_idx) = sv2(m, n); - } - } - } - } - } + remainder = dst.extent(2) % simd_vector_size; + vector_batch_size = src.ivec_4d.extent(0); + simd_batch_size = src.ivec_4d.extent(3); + if (std::is_same::value && remainder == 0) + data_layout_same_as_3d_view = true; + } else { - view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), - simd_internal_vector_size, src.ivec_4d.extent(0), - src.ivec_4d.extent(1), src.ivec_4d.extent(2), - src.ivec_4d.extent(3)); - typename view_type_5d::HostMirror h_src_raw = - Kokkos::create_mirror_view(src_raw); - size_t remainder = dst.extent(0) % simd_vector_size; - - if (remainder > 0) { - // The below loops copies each corresponding 2-rank matrix within the simd - // view back to the 3-rank view. 
- for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; - simd_internal_vec_idx++) { - auto sv0 = - Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), - Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - for (size_t simd_batch_size_idx = 0; - simd_batch_size_idx < src.ivec_4d.extent(0); - simd_batch_size_idx++) { - auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(), - Kokkos::ALL(), Kokkos::ALL()); - for (size_t vector_batch_idx = 0; - vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) { - auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), - vector_batch_idx); - for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { - for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { - dst(simd_internal_vec_idx + simd_batch_size_idx + - vector_batch_idx, - m, n) = sv2(m, n); - } - } + remainder = dst.extent(0) % simd_vector_size; + vector_batch_size = src.ivec_4d.extent(3); + simd_batch_size = src.ivec_4d.extent(0); + if (std::is_same::value && remainder == 0) + data_layout_same_as_3d_view = true; + } + + // When the batch_size is a multiple of the simd_vector_size and the batch_size + // dimension is nearest to the simd_vector_size dimension, each 2-rank matrix + // lies in the correct location and the data can simply be cast to the 3d view. + if (data_layout_same_as_3d_view) { + // We can just re-cast the data to the 3d view but we'll copy it for verification + memcpy(dst.data(), src.ivec_4d.data(), + sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * + dst.extent(2)); + return; + } + + // If the remainder is 0, we have simd_vector_size sub-batches to copy out... + // this is a bad data access pattern but for these perf_tests we will support it. + remainder = remainder == 0 ? 
simd_vector_size : remainder; + + // Views needed for slow manual copy + view_type_5d src_raw; + using subview_type_2d = Kokkos::View; + using subview_type_3d = Kokkos::View; + using subview_type_4d = Kokkos::View; + subview_type_4d sv0; + subview_type_3d sv1; + subview_type_2d sv2; + + if (std::is_same::value) + src_raw = view_type_5d((src_scalar_type *)src.ivec_4d.data(), src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3), simd_internal_vector_size); + else + src_raw = view_type_5d((src_scalar_type *)src.ivec_4d.data(), + simd_internal_vector_size, src.ivec_4d.extent(0), + src.ivec_4d.extent(1), src.ivec_4d.extent(2), + src.ivec_4d.extent(3)); + typename view_type_5d::HostMirror h_src_raw = + Kokkos::create_mirror_view(src_raw); + + // The below loops copies each corresponding 2-rank matrix within the simd + // view back to the 3-rank view. + for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; + simd_internal_vec_idx++) { + if (std::is_same::value) + sv0 = Kokkos::subview(h_src_raw, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), simd_internal_vec_idx); + else + sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + + for (size_t vector_batch_idx = 0; + vector_batch_idx < vector_batch_size; vector_batch_idx++) { + if (options.blas_args.batch_size_last_dim) + sv1 = Kokkos::subview(sv0, vector_batch_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + else + sv1 = Kokkos::subview(sv0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), vector_batch_idx); + for (size_t simd_batch_size_idx = 0; + simd_batch_size_idx < simd_batch_size; + simd_batch_size_idx++) { + if (options.blas_args.batch_size_last_dim) + sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), simd_batch_size_idx); + else + sv2 = Kokkos::subview(sv1, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL()); + for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { + for (size_t 
n = 0; n < src.ivec_4d.extent(2); n++) { + if (options.blas_args.batch_size_last_dim) + dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = sv2(m, n); + else + dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = sv2(m, n); } } } - } else { - // When the batch_size is a multiple of the simd_vector_size, each 2-rank - // matrix lies in the correct location and the data can simply be copied. - memcpy(dst.data(), src.ivec_4d.data(), - sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * - dst.extent(2)); } } } From 64d4a15d56312e0ccb1222e9dce8d8ccbdeab4d8 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 17 Mar 2021 08:24:33 -0600 Subject: [PATCH 103/126] src/batched: Add missing label member --- src/batched/KokkosBatched_Vector_SIMD.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/batched/KokkosBatched_Vector_SIMD.hpp b/src/batched/KokkosBatched_Vector_SIMD.hpp index a950e5e41f..d7d3d58080 100644 --- a/src/batched/KokkosBatched_Vector_SIMD.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD.hpp @@ -702,6 +702,9 @@ namespace KokkosBatched { enum : int { vector_length = 8 }; typedef __m512d data_type __attribute__ ((aligned(64))); + inline + static const char* label() { return "AVX512"; } + template friend class Vector; From dd0432c8eb9d402af1419b418433a95871cc44bb Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 17 Mar 2021 08:30:25 -0600 Subject: [PATCH 104/126] perf_test/blas/blas3: - Fix memory space on GEMM subview types. 
--- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index cc68dcf43c..20e792d9ae 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1415,12 +1415,12 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, // Views needed for slow manual copy view_type_5d src_raw; - using subview_type_2d = Kokkos::View; - using subview_type_3d = Kokkos::View; - using subview_type_4d = Kokkos::View; - subview_type_4d sv0; - subview_type_3d sv1; - subview_type_2d sv2; + using h_subview_type_2d = Kokkos::View; + using h_subview_type_3d = Kokkos::View; + using h_subview_type_4d = Kokkos::View; + h_subview_type_4d h_sv0; + h_subview_type_3d h_sv1; + h_subview_type_2d h_sv2; if (std::is_same::value) src_raw = view_type_5d((src_scalar_type *)src.ivec_4d.data(), src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3), simd_internal_vector_size); @@ -1437,29 +1437,29 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { if (std::is_same::value) - sv0 = Kokkos::subview(h_src_raw, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), simd_internal_vec_idx); + h_sv0 = Kokkos::subview(h_src_raw, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), simd_internal_vec_idx); else - sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + h_sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); for (size_t vector_batch_idx = 0; vector_batch_idx < vector_batch_size; vector_batch_idx++) { if 
(options.blas_args.batch_size_last_dim) - sv1 = Kokkos::subview(sv0, vector_batch_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + h_sv1 = Kokkos::subview(h_sv0, vector_batch_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); else - sv1 = Kokkos::subview(sv0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), vector_batch_idx); + h_sv1 = Kokkos::subview(h_sv0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), vector_batch_idx); for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < simd_batch_size; simd_batch_size_idx++) { if (options.blas_args.batch_size_last_dim) - sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), simd_batch_size_idx); + h_sv2 = Kokkos::subview(h_sv1, Kokkos::ALL(), Kokkos::ALL(), simd_batch_size_idx); else - sv2 = Kokkos::subview(sv1, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL()); + h_sv2 = Kokkos::subview(h_sv1, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL()); for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { if (options.blas_args.batch_size_last_dim) - dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = sv2(m, n); + dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = h_sv2(m, n); else - dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = sv2(m, n); + dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = h_sv2(m, n); } } } From cbd421f660df3aff5b190b4d7d47fb98c2d02573 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 18 Mar 2021 12:01:48 -0600 Subject: [PATCH 105/126] perf_test/blas/blas3: Fix device verify - The verification option for device is not correct. It will always compare 0s from the HostMirror views since the data was never copied to the host. 
--- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 76 ++++++++++++------- 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 20e792d9ae..b9556d1c46 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1315,14 +1315,11 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { * @var epsilon: The tolerance to use when comparing. * @return true if the comparison fails and false if the comparison succeeds. */ -static inline bool __gemm_print_compare_failure(view_type_3d expected, - view_type_3d actual, int i, +template +static inline bool __gemm_print_compare_failure(ViewType h_expected, + ViewType h_actual, int i, int j, int k, double epsilon) { STATUS; - typename view_type_3d::HostMirror h_expected = - Kokkos::create_mirror_view(expected); - typename view_type_3d::HostMirror h_actual = - Kokkos::create_mirror_view(actual); auto diff = static_cast(Kokkos::Experimental::fabs( static_cast(h_expected(i, j, k) - h_actual(i, j, k)))); @@ -1349,11 +1346,21 @@ static inline bool __gemm_do_compare(view_type_3d expected, double epsilon = Test::epsilon::value * 1e3; STATUS; + typename view_type_3d::HostMirror h_expected = + Kokkos::create_mirror_view(expected); + typename view_type_3d::HostMirror h_actual = + Kokkos::create_mirror_view(actual); + + // Copy to host for comparision + Kokkos::deep_copy(h_expected, expected); + Kokkos::deep_copy(h_actual, actual); + Kokkos::fence(); + if (std::is_same::value) { - for (size_t i = 0; i < expected.extent(0); i++) { - for (size_t j = 0; j < expected.extent(1); j++) { - for (size_t k = 0; k < expected.extent(2); k++) { - if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon)) + for (size_t i = 0; i < h_expected.extent(0); i++) { + for (size_t j = 0; j < h_expected.extent(1); j++) { + for (size_t k = 0; 
k < h_expected.extent(2); k++) { + if (__gemm_print_compare_failure(h_expected, h_actual, i, j, k, epsilon)) return true; } } @@ -1361,10 +1368,10 @@ static inline bool __gemm_do_compare(view_type_3d expected, } if (std::is_same::value) { - for (size_t k = 0; k < expected.extent(2); k++) { - for (size_t j = 0; j < expected.extent(1); j++) { - for (size_t i = 0; i < expected.extent(0); i++) { - if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon)) + for (size_t k = 0; k < h_expected.extent(2); k++) { + for (size_t j = 0; j < h_expected.extent(1); j++) { + for (size_t i = 0; i < h_expected.extent(0); i++) { + if (__gemm_print_compare_failure(h_expected, h_actual, i, j, k, epsilon)) return true; } } @@ -1380,20 +1387,28 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, options_t options) { using dst_scalar_type = typename dstViewType::value_type; using src_scalar_type = typename view_type_5d::value_type; - size_t remainder, vector_batch_size, simd_batch_size; + size_t remainder, vector_batch_size, simd_batch_size, last_batch; bool data_layout_same_as_3d_view = false; + typename dstViewType::HostMirror h_dst = + Kokkos::create_mirror_view(dst); + typename view_type_4d::HostMirror h_src = + Kokkos::create_mirror_view(src.mat_4d); + Kokkos::deep_copy(h_src, src.mat_4d); + Kokkos::fence(); if (options.blas_args.batch_size_last_dim) { - remainder = dst.extent(2) % simd_vector_size; + remainder = dst.extent(2) % simd_internal_vector_size; vector_batch_size = src.ivec_4d.extent(0); simd_batch_size = src.ivec_4d.extent(3); + last_batch = dst.extent(2); if (std::is_same::value && remainder == 0) data_layout_same_as_3d_view = true; } else { - remainder = dst.extent(0) % simd_vector_size; + remainder = dst.extent(0) % simd_internal_vector_size; vector_batch_size = src.ivec_4d.extent(3); simd_batch_size = src.ivec_4d.extent(0); + last_batch = dst.extent(0); if (std::is_same::value && remainder == 0) data_layout_same_as_3d_view = true; } 
@@ -1403,34 +1418,38 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, // lies in the correct location and the data can simply be cast to the 3d view. if (data_layout_same_as_3d_view) { // We can just re-cast the data to the 3d view but we'll copy it for verification - memcpy(dst.data(), src.ivec_4d.data(), + memcpy(h_dst.data(), h_src.data(), sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2)); + Kokkos::deep_copy(dst, h_dst); + Kokkos::fence(); return; } // If the remainder is 0, we have simd_vector_size sub-batches to copy out... // this is a bad data access pattern but for these perf_tests we will support it. - remainder = remainder == 0 ? simd_vector_size : remainder; + // If the remainder is non-zero, we have simd_vector_size sub-batches + remainder to + // copy out. + remainder += simd_internal_vector_size; // Views needed for slow manual copy - view_type_5d src_raw; + using h_view_type_5d = Kokkos::View; using h_subview_type_2d = Kokkos::View; using h_subview_type_3d = Kokkos::View; using h_subview_type_4d = Kokkos::View; + h_view_type_5d h_src_raw; h_subview_type_4d h_sv0; h_subview_type_3d h_sv1; h_subview_type_2d h_sv2; + // TODO: Clean everything below this point up... 
if (std::is_same::value) - src_raw = view_type_5d((src_scalar_type *)src.ivec_4d.data(), src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3), simd_internal_vector_size); + h_src_raw = h_view_type_5d((src_scalar_type *)h_src.data(), src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3), simd_internal_vector_size); else - src_raw = view_type_5d((src_scalar_type *)src.ivec_4d.data(), + h_src_raw = h_view_type_5d((src_scalar_type *)h_src.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); - typename view_type_5d::HostMirror h_src_raw = - Kokkos::create_mirror_view(src_raw); // The below loops copies each corresponding 2-rank matrix within the simd // view back to the 3-rank view. @@ -1457,14 +1476,19 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { if (options.blas_args.batch_size_last_dim) - dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = h_sv2(m, n); + h_dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = h_sv2(m, n); else - dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = h_sv2(m, n); + h_dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = h_sv2(m, n); } } + if (simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx == last_batch - 1) + goto out; } } } +out: + Kokkos::deep_copy(dst, h_dst); + Kokkos::fence(); } /** From 5f45262952653b3dc1c8b59c16e55df1b2ce69a2 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Thu, 18 Mar 2021 14:58:07 -0600 Subject: [PATCH 106/126] Fixed nrm1 (#914), improved blas tests - Made nrm1 compute the sum of all absolute real and imaginary parts to match BLAS/MKL/CUBLAS behavior, rather than sum of magnitudes. 
- Improved unit test coverage - verify each output element, not just dotprod of output with itself - for complex, create randomized inputs with nonzero imaginary parts - enable conj-trans mode testing for gemv --- src/blas/impl/KokkosBlas1_nrm1_impl.hpp | 40 ++-- .../KokkosBlas1_nrminf_tpl_spec_avail.hpp | 19 -- .../tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp | 178 +----------------- test_common/KokkosKernels_TestUtils.hpp | 31 ++- unit_test/blas/Test_Blas1_abs.hpp | 88 ++++----- unit_test/blas/Test_Blas1_asum.hpp | 15 +- unit_test/blas/Test_Blas1_axpby.hpp | 86 +++++---- unit_test/blas/Test_Blas1_axpy.hpp | 86 +++++---- unit_test/blas/Test_Blas1_dot.hpp | 28 ++- unit_test/blas/Test_Blas1_iamax.hpp | 12 +- unit_test/blas/Test_Blas1_mult.hpp | 103 ++++++---- unit_test/blas/Test_Blas1_nrm1.hpp | 76 ++++---- unit_test/blas/Test_Blas1_nrm2.hpp | 12 +- unit_test/blas/Test_Blas1_nrm2_squared.hpp | 12 +- unit_test/blas/Test_Blas1_nrminf.hpp | 15 +- unit_test/blas/Test_Blas1_reciprocal.hpp | 28 ++- unit_test/blas/Test_Blas1_scal.hpp | 119 ++++++------ unit_test/blas/Test_Blas1_sum.hpp | 15 +- unit_test/blas/Test_Blas1_team_dot.hpp | 4 - unit_test/blas/Test_Blas1_team_nrm2.hpp | 2 - unit_test/blas/Test_Blas1_team_scal.hpp | 4 - unit_test/blas/Test_Blas1_team_update.hpp | 4 - unit_test/blas/Test_Blas1_update.hpp | 103 ++++++---- unit_test/blas/Test_Blas2_gemv.hpp | 67 ++++--- unit_test/blas/Test_Blas2_team_gemv.hpp | 2 - unit_test/blas/Test_Blas3_gemm.hpp | 4 - unit_test/blas/Test_Blas3_trmm.hpp | 3 - unit_test/blas/Test_Blas3_trsm.hpp | 2 - 28 files changed, 556 insertions(+), 602 deletions(-) diff --git a/src/blas/impl/KokkosBlas1_nrm1_impl.hpp b/src/blas/impl/KokkosBlas1_nrm1_impl.hpp index 9e1393eb5a..296c424b3c 100644 --- a/src/blas/impl/KokkosBlas1_nrm1_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm1_impl.hpp @@ -52,10 +52,10 @@ namespace KokkosBlas { namespace Impl { // -// nrm1_squared +// nrm1 // -/// \brief 2-norm (squared) functor for single vectors. 
+/// \brief 1-norm functor for single vectors. /// /// \tparam RV 0-D output View /// \tparam XV 1-D input View @@ -63,12 +63,12 @@ namespace Impl { template struct V_Nrm1_Functor { - typedef typename XV::execution_space execution_space; - typedef SizeType size_type; - typedef typename XV::non_const_value_type xvalue_type; - typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; - typedef typename IPT::mag_type value_type; + typedef typename XV::execution_space execution_space; + typedef SizeType size_type; + typedef typename XV::non_const_value_type xvalue_type; + typedef Kokkos::ArithTraits XAT; + typedef typename XAT::mag_type value_type; + typedef Kokkos::ArithTraits MAT; typename XV::const_type m_x; @@ -94,12 +94,13 @@ struct V_Nrm1_Functor KOKKOS_INLINE_FUNCTION void operator() (const size_type& i, value_type& sum) const { - sum += IPT::norm (m_x(i)); + xvalue_type val = m_x(i); + sum += MAT::abs(XAT::real(val)) + MAT::abs(XAT::imag(val)); } KOKKOS_INLINE_FUNCTION void init (value_type& update) const { - update = AT::zero (); + update = MAT::zero (); } KOKKOS_INLINE_FUNCTION void @@ -117,7 +118,7 @@ struct V_Nrm1_Functor } }; -/// \brief Column-wise 2-norm functor for multivectors; works for +/// \brief Column-wise 1-norm functor for multivectors; works for /// any layout, but best performance with LayoutRight. 
/// /// \tparam RV 1-D output View @@ -126,12 +127,12 @@ struct V_Nrm1_Functor template struct MV_Nrm1_Right_FunctorVector { - typedef typename XMV::execution_space execution_space; - typedef SizeType size_type; - typedef typename XMV::non_const_value_type xvalue_type; - typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; - typedef typename IPT::mag_type value_type[]; + typedef typename XMV::execution_space execution_space; + typedef SizeType size_type; + typedef typename XMV::non_const_value_type xvalue_type; + typedef Kokkos::ArithTraits XAT; + typedef Kokkos::ArithTraits MAT; + typedef typename XAT::mag_type value_type[]; size_type value_count; typename XMV::const_type m_x; @@ -166,7 +167,8 @@ struct MV_Nrm1_Right_FunctorVector #pragma vector always #endif for (size_type j = 0; j < numVecs; ++j) { - sum[j] += IPT::norm (m_x(i,j)); + xvalue_type val = m_x(i, j); + sum[j] += MAT::abs(XAT::real(val)) + MAT::abs(XAT::imag(val)); } } @@ -181,7 +183,7 @@ struct MV_Nrm1_Right_FunctorVector #pragma vector always #endif for (size_type j = 0; j < numVecs; ++j) { - update[j] = AT::zero (); + update[j] = MAT::zero (); } } diff --git a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp index 16d22e7b02..072abff904 100644 --- a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp @@ -77,25 +77,6 @@ KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutL #endif -// cuBLAS -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -// double -#define KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( SCALAR, LAYOUT, MEMSPACE ) \ -template \ -struct nrminf_tpl_spec_avail< \ -Kokkos::View::mag_type, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ -Kokkos::View, \ - Kokkos::MemoryTraits >, \ -1> { enum : bool { value = true }; }; - -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) 
-KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) - -#endif - } } #endif diff --git a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp index 5f7a102e77..b91e81891a 100644 --- a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp @@ -83,6 +83,7 @@ Kokkos::View, \ typedef Kokkos::View, \ Kokkos::MemoryTraits > XV; \ typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ \ static void nrminf (RV& R, const XV& X) \ { \ @@ -94,7 +95,7 @@ Kokkos::View, \ int N = numElems; \ int one = 1; \ int idx = HostBlas::iamax(N,X.data(),one)-1; \ - R() = X(idx); \ + R() = IPT::norm(X(idx)); \ } else { \ NrmInf::nrminf(R,X); \ } \ @@ -116,6 +117,7 @@ Kokkos::View, \ typedef Kokkos::View, \ Kokkos::MemoryTraits > XV; \ typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ \ static void nrminf (RV& R, const XV& X) \ { \ @@ -127,7 +129,7 @@ Kokkos::View, \ int N = numElems; \ int one = 1; \ int idx = HostBlas::iamax(N,X.data(),one)-1; \ - R() = X(idx); \ + R() = IPT::norm(X(idx)); \ } else { \ NrmInf::nrminf(R,X); \ } \ @@ -220,176 +222,4 @@ KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS( Kokkos::LayoutLeft, Kokkos::HostSpace, f #endif -// cuBLAS -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#include - -namespace KokkosBlas { -namespace Impl { - -#define KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ -template \ -struct NrmInf< \ -Kokkos::View >, \ -Kokkos::View, \ - Kokkos::MemoryTraits >, \ -1,true, ETI_SPEC_AVAIL > { \ - \ - typedef Kokkos::View > RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > XV; \ - typedef typename 
XV::size_type size_type; \ - \ - static void nrminf (RV& R, const XV& X) \ - { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { Kokkos::deep_copy (R, 0.0); return; } \ - if (numElems < static_cast (INT_MAX)) { \ - nrminf_print_specialization(); \ - const int N = static_cast (numElems); \ - constexpr int one = 1; \ - int idx; \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasIdamax(s.handle, N, X.data(), one, &idx); \ - Kokkos::deep_copy(R, subview(X,idx-1)); \ - } else { \ - NrmInf::nrminf(R,X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ -}; - -#define KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ -template \ -struct NrmInf< \ -Kokkos::View >, \ -Kokkos::View, \ - Kokkos::MemoryTraits >, \ -1,true, ETI_SPEC_AVAIL > { \ - \ - typedef Kokkos::View > RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrminf (RV& R, const XV& X) \ - { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { Kokkos::deep_copy (R, 0.0f);; return; } \ - if (numElems < static_cast (INT_MAX)) { \ - nrminf_print_specialization(); \ - const int N = static_cast (numElems); \ - constexpr int one = 1; \ - int idx; \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasIsamax(s.handle, N, X.data(), one, &idx); \ - Kokkos::deep_copy(R, subview(X,idx-1)); \ - } else { \ - NrmInf::nrminf(R,X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ -}; - -#define KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ -template \ -struct NrmInf< \ -Kokkos::View >, \ -Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ -1,true, ETI_SPEC_AVAIL > { \ - \ - typedef 
Kokkos::View > RV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits> IPT; \ - \ - static void nrminf (RV& R, const XV& X) \ - { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { Kokkos::deep_copy (R, 0.0); return; } \ - if (numElems < static_cast (INT_MAX)) { \ - nrminf_print_specialization(); \ - const int N = static_cast (numElems); \ - constexpr int one = 1; \ - int idx; \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasIzamax(s.handle, N, reinterpret_cast(X.data()), one, &idx); \ - Kokkos::complex R_cplx_val {0.0, 0.0}; \ - Kokkos::View, LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits > R_cplx (&R_cplx_val); \ - Kokkos::deep_copy(R_cplx, subview(X,idx-1)); \ - R() = IPT::norm(R_cplx()); \ - } else { \ - NrmInf::nrminf(R,X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ -}; - -#define KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ -template \ -struct NrmInf< \ -Kokkos::View >, \ -Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ -1,true, ETI_SPEC_AVAIL > { \ - \ - typedef Kokkos::View > RV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits> IPT; \ - \ - static void nrminf (RV& R, const XV& X) \ - { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { Kokkos::deep_copy (R, 0.0f); return; } \ - if (numElems < static_cast (INT_MAX)) { \ - nrminf_print_specialization(); \ - const int N = static_cast (numElems); \ - constexpr int one = 1; \ - int idx; \ - KokkosBlas::Impl::CudaBlasSingleton & s = 
KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasIcamax(s.handle, N, reinterpret_cast(X.data()), one, &idx); \ - Kokkos::complex R_cplx_val {0.0f, 0.0f}; \ - Kokkos::View, LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits > R_cplx (&R_cplx_val); \ - Kokkos::deep_copy(R_cplx, subview(X,idx-1)); \ - R() = IPT::norm(R_cplx()); \ - } else { \ - NrmInf::nrminf(R,X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ -}; - -KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -} -} - -#endif - #endif diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 43f2d48460..f3a34ba123 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -46,6 +46,8 @@ #define KOKKOSKERNELS_TEST_UTILS_HPP #include "KokkosKernels_Utils.hpp" +#include "Kokkos_ArithTraits.hpp" + namespace Test { template::value> struct multivector_layout_adapter; @@ -83,16 +85,15 @@ namespace Test { template void EXPECT_NEAR_KK(Scalar1 val1, Scalar2 val2, Scalar3 tol) { typedef Kokkos::Details::ArithTraits AT1; - typedef Kokkos::Details::ArithTraits AT2; typedef Kokkos::Details::ArithTraits AT3; - EXPECT_NEAR(double(AT1::abs(val1)),double(AT2::abs(val2)),double(AT3::abs(tol))); + EXPECT_LE((double) AT1::abs(val1 - val2), (double) AT3::abs(tol)); } template void 
EXPECT_NEAR_KK_1DVIEW(ViewType1 v1, ViewType2 v2, Scalar tol) { size_t v1_size = v1.extent(0); size_t v2_size = v2.extent(0); - EXPECT_NEAR_KK(v1_size, v2_size, 0); + EXPECT_EQ(v1_size, v2_size); typename ViewType1::HostMirror h_v1 = Kokkos::create_mirror_view(v1); @@ -227,5 +228,29 @@ namespace Test { constexpr static double value = 0.0009765625F; }; #endif // KOKKOS_HALF_T_IS_FLOAT + + //Get the interval for Kokkos::fill_random + //For real, interval is (-mag, mag) + //For complex, both real and imaginary parts will have interval (-mag, mag) + template + inline void getRandomBounds(double mag, Scalar& start, Scalar& end) + { + start = -mag * Kokkos::ArithTraits::one(); + end = mag * Kokkos::ArithTraits::one(); + } + + template<> + inline void getRandomBounds(double mag, Kokkos::complex& start, Kokkos::complex& end) + { + start = Kokkos::complex(-mag, -mag); + end = Kokkos::complex(mag, mag); + } + + template<> + inline void getRandomBounds(double mag, Kokkos::complex& start, Kokkos::complex& end) + { + start = Kokkos::complex(-mag, -mag); + end = Kokkos::complex(mag, mag); + } } #endif diff --git a/unit_test/blas/Test_Blas1_abs.hpp b/unit_test/blas/Test_Blas1_abs.hpp index acdb167d1d..d1cb36d368 100644 --- a/unit_test/blas/Test_Blas1_abs.hpp +++ b/unit_test/blas/Test_Blas1_abs.hpp @@ -2,7 +2,6 @@ #include #include #include -#include #include namespace Test { @@ -23,7 +22,7 @@ namespace Test { Kokkos::LayoutRight, Kokkos::LayoutLeft>::type,Device> BaseTypeB; - double eps = std::is_same::value?2*1e-5:1e-7; + typename AT::mag_type eps = AT::epsilon()*10; BaseTypeA b_x("X",N); BaseTypeB b_y("Y",N); @@ -42,29 +41,38 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + 
Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); - ScalarA expected_result(0); - for(int i=0;i rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); - Kokkos::deep_copy(h_b_y,b_y); typename ViewTypeA::const_type c_x = x; - ScalarA* expected_result = new ScalarA[K]; - for(int j=0;j r("Dot::Result",K); + typename AT::mag_type eps = AT::epsilon()*10; + //Test and verify non-const input KokkosBlas::abs(y,x); - KokkosBlas::dot(r,y,y); - for(int k=0;k AT; + typedef Kokkos::ArithTraits MAT; typedef Kokkos::View rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); + Kokkos::deep_copy(h_b_a,b_a); typename ViewTypeA::const_type c_a = a; @@ -36,7 +39,13 @@ namespace Test { typename AT::mag_type expected_result = 0; for(int i=0;i::imag is 0 if T is real. 
+ expected_result += MAT::abs(AT::real(h_a(i))) + MAT::abs(AT::imag(h_a(i))); + } typename AT::mag_type nonconst_result = KokkosBlas::asum(a); EXPECT_NEAR_KK( nonconst_result, expected_result, eps*expected_result); diff --git a/unit_test/blas/Test_Blas1_axpby.hpp b/unit_test/blas/Test_Blas1_axpby.hpp index f2bc692d09..84943b1bc7 100644 --- a/unit_test/blas/Test_Blas1_axpby.hpp +++ b/unit_test/blas/Test_Blas1_axpby.hpp @@ -31,6 +31,7 @@ namespace Test { BaseTypeB b_org_y("Org_Y",N); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); ViewTypeA x = Kokkos::subview(b_x,Kokkos::ALL(),0); ViewTypeB y = Kokkos::subview(b_y,Kokkos::ALL(),0); typename ViewTypeA::const_type c_x = x; @@ -44,26 +45,38 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + Kokkos::deep_copy(h_b_org_y, b_org_y); Kokkos::deep_copy(h_b_x,b_x); - Kokkos::deep_copy(h_b_y,b_y); - - ScalarA expected_result = 0; - for(int i=0;i @@ -93,10 +106,19 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); 
Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); @@ -105,36 +127,32 @@ namespace Test { ScalarB b = 5; typename ViewTypeA::const_type c_x = x; - ScalarA* expected_result = new ScalarA[K]; - for(int j=0;j::value?2*1e-5:1e-7; Kokkos::View r("Dot::Result",K); - typedef Kokkos::Details::ArithTraits AT; - KokkosBlas::axpby(a,x,b,y); - KokkosBlas::dot(r,y,y); - for(int k=0;k::value, Kokkos::LayoutRight, Kokkos::LayoutLeft>::type,Device> BaseTypeB; + using MagnitudeA = typename Kokkos::ArithTraits::mag_type; ScalarA a = 3; - double eps = std::is_same::value?2*1e-5:1e-7; + double eps = std::is_same::value?2e-5:1e-7; BaseTypeA b_x("X",N); BaseTypeB b_y("Y",N); BaseTypeB b_org_y("Org_Y",N); - ViewTypeA x = Kokkos::subview(b_x,Kokkos::ALL(),0); ViewTypeB y = Kokkos::subview(b_y,Kokkos::ALL(),0); @@ -43,26 +43,40 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); Kokkos::deep_copy(h_b_x,b_x); - Kokkos::deep_copy(h_b_y,b_y); - ScalarA expected_result = 0; + KokkosBlas::axpy(a,x,y); + Kokkos::deep_copy(h_b_y, b_y); + for(int i=0;i @@ -92,10 +106,19 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + 
Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); @@ -103,33 +126,28 @@ namespace Test { ScalarA a = 3; typename ViewTypeA::const_type c_x = x; - ScalarA* expected_result = new ScalarA[K]; - for(int j=0;j::value?2*1e-5:1e-7; - Kokkos::View r("Dot::Result",K); - KokkosBlas::axpy(a,x,y); - KokkosBlas::dot(r,y,y); - for(int k=0;k rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_b,rand_pool,ScalarB(10)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_b,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(h_b_a,b_a); Kokkos::deep_copy(h_b_b,b_b); @@ -92,10 +98,16 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_b,rand_pool,ScalarB(10)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_b,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(h_b_a,b_a); Kokkos::deep_copy(h_b_b,b_b); diff --git a/unit_test/blas/Test_Blas1_iamax.hpp b/unit_test/blas/Test_Blas1_iamax.hpp index 166c25c1a8..5e98912553 100644 --- a/unit_test/blas/Test_Blas1_iamax.hpp +++ b/unit_test/blas/Test_Blas1_iamax.hpp @@ -29,9 +29,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, 
randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -115,9 +115,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); diff --git a/unit_test/blas/Test_Blas1_mult.hpp b/unit_test/blas/Test_Blas1_mult.hpp index fcab767dcc..1f6856a934 100644 --- a/unit_test/blas/Test_Blas1_mult.hpp +++ b/unit_test/blas/Test_Blas1_mult.hpp @@ -29,7 +29,7 @@ namespace Test { ScalarA a = 3; ScalarB b = 5; - double eps = std::is_same::value?2*1e-5:1e-7; + double eps = std::is_same::value?1e-4:1e-7; BaseTypeA b_x("X",N); BaseTypeB b_y("Y",N); @@ -53,33 +53,52 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); - Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarC randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_z,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_z,b_z); + auto h_b_org_z = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); - Kokkos::deep_copy(h_b_z,b_z); - ScalarA expected_result = 0; - for(int i=0;i @@ -118,11 +137,24 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); - Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); + { + 
ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarC randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_z,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_z,b_z); + auto h_b_org_z = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); @@ -133,33 +165,28 @@ namespace Test { typename ViewTypeA::const_type c_x = x; typename ViewTypeB::const_type c_y = y; - ScalarC* expected_result = new ScalarC[K]; - for(int j=0;j::value?2*1e-5:1e-7; - - Kokkos::View r("Dot::Result",K); + double eps = std::is_same::value?1e-4:1e-7; KokkosBlas::mult(b,z,a,x,y); - KokkosBlas::dot(r,z,z); - for(int k=0;k AT; + typedef Kokkos::ArithTraits AT; + typedef typename AT::mag_type mag_type; + typedef Kokkos::ArithTraits MAT; typedef Kokkos::View rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); typename ViewTypeA::const_type c_a = a; - double eps = std::is_same::value?2*1e-5:1e-7; + double eps = (std::is_same::mag_type, float>::value ? 1e-4 : 1e-7); - typename AT::mag_type expected_result = 0; + mag_type expected_result = 0; for(int i=0;i::imag is 0 if T is real. 
+ expected_result += MAT::abs(AT::real(h_a(i))) + MAT::abs(AT::imag(h_a(i))); + } - typename AT::mag_type const_result = KokkosBlas::nrm1(c_a); - EXPECT_NEAR_KK( const_result, expected_result, eps*expected_result); + mag_type nonconst_result = KokkosBlas::nrm1(a); + EXPECT_NEAR_KK( nonconst_result, expected_result, eps * expected_result ); + mag_type const_result = KokkosBlas::nrm1(c_a); + EXPECT_NEAR_KK( const_result, expected_result, eps * expected_result ); } template @@ -53,6 +61,8 @@ namespace Test { typedef typename ViewTypeA::value_type ScalarA; typedef Kokkos::Details::ArithTraits AT; + typedef typename AT::mag_type mag_type; + typedef Kokkos::ArithTraits MAT; typedef multivector_layout_adapter vfA_type; @@ -68,38 +78,36 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); typename ViewTypeA::const_type c_a = a; - typename AT::mag_type* expected_result = new typename AT::mag_type[K]; - for(int j=0;j::mag_type, float>::value ? 
1e-4 : 1e-7); + + Kokkos::View expected_result("Expected Nrm1", K); + for(int k = 0; k < K; k++) + { + expected_result(k) = MAT::zero(); for(int i=0;i::value?2*1e-5:1e-7; - - Kokkos::View r("Dot::Result",K); + Kokkos::View r("Nrm1::Result",K); + Kokkos::View c_r("Nrm1::ConstResult",K); - KokkosBlas::nrm1(r,a); - for(int k=0;k rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(1)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -69,9 +69,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(1)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); diff --git a/unit_test/blas/Test_Blas1_nrm2_squared.hpp b/unit_test/blas/Test_Blas1_nrm2_squared.hpp index ac116b8987..aef2e2e95e 100644 --- a/unit_test/blas/Test_Blas1_nrm2_squared.hpp +++ b/unit_test/blas/Test_Blas1_nrm2_squared.hpp @@ -27,9 +27,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(1)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -68,9 +68,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(1)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); diff --git a/unit_test/blas/Test_Blas1_nrminf.hpp b/unit_test/blas/Test_Blas1_nrminf.hpp index f328a720b7..0893045dee 100644 --- a/unit_test/blas/Test_Blas1_nrminf.hpp +++ b/unit_test/blas/Test_Blas1_nrminf.hpp @@ -27,9 
+27,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -70,9 +70,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -98,13 +98,12 @@ namespace Test { EXPECT_NEAR_KK( nonconst_result, exp_result, eps*exp_result); } - /* KokkosBlas::nrminf(r,c_a); + KokkosBlas::nrminf(r,c_a); for(int k=0;k rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); @@ -99,10 +105,16 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); diff --git a/unit_test/blas/Test_Blas1_scal.hpp b/unit_test/blas/Test_Blas1_scal.hpp index f59b8d49ea..254850f1ae 100644 --- a/unit_test/blas/Test_Blas1_scal.hpp +++ b/unit_test/blas/Test_Blas1_scal.hpp @@ 
-25,13 +25,10 @@ namespace Test { ScalarA a(3); typename AT::mag_type eps = AT::epsilon()*1000; - typename AT::mag_type zero = AT::abs( AT::zero() ); - typename AT::mag_type one = AT::abs( AT::one() ); BaseTypeA b_x("X",N); BaseTypeB b_y("Y",N); BaseTypeB b_org_y("Org_Y",N); - ViewTypeA x = Kokkos::subview(b_x,Kokkos::ALL(),0); ViewTypeB y = Kokkos::subview(b_y,Kokkos::ALL(),0); @@ -46,35 +43,35 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); - ScalarA expected_result(0); - for(int i=0;i rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::fence(); Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); - Kokkos::deep_copy(h_b_y,b_y); ScalarA a(3.0); typename ViewTypeA::const_type c_x = x; - ScalarA* expected_result = new ScalarA[K]; - for(int j=0;j r("Dot::Result",K); KokkosBlas::scal(y,a,x); - KokkosBlas::dot(r,y,y); - for(int k=0;k params("Params",K); for(int j=0; j rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); 
Kokkos::deep_copy(h_b_a,b_a); @@ -51,7 +51,6 @@ namespace Test { void impl_test_sum_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; typedef multivector_layout_adapter vfA_type; @@ -67,9 +66,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -79,7 +78,7 @@ namespace Test { for(int j=0;j::value?2*1e-5:1e-7; diff --git a/unit_test/blas/Test_Blas1_team_dot.hpp b/unit_test/blas/Test_Blas1_team_dot.hpp index 158dcf5733..f3c819da3b 100644 --- a/unit_test/blas/Test_Blas1_team_dot.hpp +++ b/unit_test/blas/Test_Blas1_team_dot.hpp @@ -46,8 +46,6 @@ namespace Test { Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); Kokkos::fill_random(b_b,rand_pool,ScalarB(10)); - Kokkos::fence(); - Kokkos::deep_copy(h_b_a,b_a); Kokkos::deep_copy(h_b_b,b_b); @@ -150,8 +148,6 @@ namespace Test { Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); Kokkos::fill_random(b_b,rand_pool,ScalarB(10)); - Kokkos::fence(); - Kokkos::deep_copy(h_b_a,b_a); Kokkos::deep_copy(h_b_b,b_b); diff --git a/unit_test/blas/Test_Blas1_team_nrm2.hpp b/unit_test/blas/Test_Blas1_team_nrm2.hpp index 4c654c7eae..99147053ed 100644 --- a/unit_test/blas/Test_Blas1_team_nrm2.hpp +++ b/unit_test/blas/Test_Blas1_team_nrm2.hpp @@ -33,8 +33,6 @@ namespace Test { Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - Kokkos::fence(); - Kokkos::deep_copy(h_b_a,b_a); typename ViewTypeA::const_type c_a = a; diff --git a/unit_test/blas/Test_Blas1_team_scal.hpp b/unit_test/blas/Test_Blas1_team_scal.hpp index 6b33caa262..fb6ef4487d 100644 --- a/unit_test/blas/Test_Blas1_team_scal.hpp +++ b/unit_test/blas/Test_Blas1_team_scal.hpp @@ -57,8 +57,6 @@ namespace Test { Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); 
Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); @@ -132,8 +130,6 @@ namespace Test { Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); diff --git a/unit_test/blas/Test_Blas1_team_update.hpp b/unit_test/blas/Test_Blas1_team_update.hpp index dcc9d1e486..5298a6798d 100644 --- a/unit_test/blas/Test_Blas1_team_update.hpp +++ b/unit_test/blas/Test_Blas1_team_update.hpp @@ -66,8 +66,6 @@ namespace Test { Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_z,b_z); Kokkos::deep_copy(h_b_x,b_x); @@ -149,8 +147,6 @@ namespace Test { Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_z,b_z); Kokkos::deep_copy(h_b_x,b_x); diff --git a/unit_test/blas/Test_Blas1_update.hpp b/unit_test/blas/Test_Blas1_update.hpp index 8bfcdbe5cc..0ece3ae74c 100644 --- a/unit_test/blas/Test_Blas1_update.hpp +++ b/unit_test/blas/Test_Blas1_update.hpp @@ -54,35 +54,52 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); - Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarC randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_z,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_z,b_z); + auto h_b_org_z = 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); + auto h_org_z = Kokkos::subview(h_b_org_z, Kokkos::ALL(), 0); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); Kokkos::deep_copy(h_b_z,b_z); - ScalarA expected_result = 0; - for(int i=0;i @@ -119,13 +136,24 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); - Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarC randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_z,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_z,b_z); + auto h_b_org_z = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); @@ -137,33 +165,28 @@ namespace Test { typename ViewTypeA::const_type c_x = x; typename ViewTypeB::const_type c_y = y; - ScalarC* expected_result = new ScalarC[K]; - for(int j=0;j::value?2*1e-5:1e-7; - Kokkos::View r("Dot::Result",K); - KokkosBlas::update(a,x,b,y,c,z); - KokkosBlas::dot(r,z,z); - for(int k=0;k::value ? 2*1e-5 : 1e-7); + double eps = (std::is_same::mag_type, float>::value ? 
1e-3 : 1e-10); int ldx; int ldy; @@ -61,59 +61,80 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarX(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarY(10)); - Kokkos::fill_random(b_A,rand_pool,ScalarA(10)); - - Kokkos::fence(); + { + ScalarX randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarY randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_A,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); + auto h_org_y = Kokkos::subview(h_b_org_y, Kokkos::ALL(), 0); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); Kokkos::deep_copy(h_b_A,b_A); typedef Kokkos::Details::ArithTraits KAT; - ScalarY expected_result = KAT:: zero(); + Kokkos::View expected("expected aAx+by", ldy); if(mode[0] == 'N') { for(int i = 0; i < M; i++) { - ScalarY y_i = KAT::zero (); + ScalarY y_i = beta * h_org_y(i); for(int j = 0; j < N; j++) { - y_i += h_A(i,j) * h_x(j); + y_i += alpha * h_A(i,j) * h_x(j); } - expected_result += (beta * h_y(i) + alpha * y_i) * (beta * h_y(i) + alpha * y_i) ; + expected(i) = y_i; } } else if(mode[0] == 'T') { for(int j = 0; j < N; j++) { - ScalarY y_j = KAT::zero (); + ScalarY y_j = beta * h_org_y(j); for(int i = 0; i < M; i++) { - y_j += h_A(i,j) * h_x(i); + y_j += alpha * h_A(i,j) * h_x(i); } - expected_result += (beta * h_y(j) + alpha * y_j) * (beta * h_y(j) + alpha * y_j) ; + expected(j) = y_j; } } else if(mode[0] == 'C') { for(int j = 0; j < N; j++) { - ScalarY y_j = KAT::zero (); + ScalarY y_j = beta * h_org_y(j); for(int i = 0; i < M; i++) { - y_j += KAT::conj (h_A(i,j)) * h_x(i); + y_j += alpha * KAT::conj 
(h_A(i,j)) * h_x(i); } - expected_result += (beta * h_y(j) + alpha * y_j) * (beta * h_y(j) + alpha * y_j) ; + expected(j) = y_j; } } KokkosBlas::gemv(mode, alpha, A, x, beta, y); - ScalarY nonconst_nonconst_result = KokkosBlas::dot(y, y); - EXPECT_NEAR_KK( nonconst_nonconst_result, expected_result, eps*expected_result); + Kokkos::deep_copy(h_b_y, b_y); + for(int i = 0; i < ldy; i++) + { + EXPECT_NEAR_KK(expected(i), h_y(i), eps * expected(i)); + } Kokkos::deep_copy(b_y, b_org_y); KokkosBlas::gemv(mode, alpha,A ,c_x, beta, y); - ScalarY const_nonconst_result = KokkosBlas::dot(y, y); - EXPECT_NEAR_KK( const_nonconst_result, expected_result, eps*expected_result); + Kokkos::deep_copy(h_b_y, b_y); + for(int i = 0; i < ldy; i++) + { + EXPECT_NEAR_KK(expected(i), h_y(i), eps); + } Kokkos::deep_copy(b_y, b_org_y); KokkosBlas::gemv(mode, alpha, c_A, c_x, beta, y); - ScalarY const_const_result = KokkosBlas::dot(y, y); - EXPECT_NEAR_KK( const_const_result, expected_result, eps*expected_result); + Kokkos::deep_copy(h_b_y, b_y); + for(int i = 0; i < ldy; i++) + { + EXPECT_NEAR_KK(expected(i), h_y(i), eps); + } } } @@ -203,7 +224,7 @@ TEST_F( TestCategory, gemv_complex_double ) { Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_conj_complex_double"); - test_gemv,Kokkos::complex,Kokkos::complex,TestExecSpace> ("T"); + test_gemv,Kokkos::complex,Kokkos::complex,TestExecSpace> ("C"); Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas2_team_gemv.hpp b/unit_test/blas/Test_Blas2_team_gemv.hpp index 124941bfd8..f8a7f7c1be 100644 --- a/unit_test/blas/Test_Blas2_team_gemv.hpp +++ b/unit_test/blas/Test_Blas2_team_gemv.hpp @@ -64,8 +64,6 @@ namespace Test { Kokkos::fill_random(b_y,rand_pool,ScalarY(10)); Kokkos::fill_random(b_A,rand_pool,ScalarA(10)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); diff --git a/unit_test/blas/Test_Blas3_gemm.hpp b/unit_test/blas/Test_Blas3_gemm.hpp index 
451b7fedac..580de25397 100644 --- a/unit_test/blas/Test_Blas3_gemm.hpp +++ b/unit_test/blas/Test_Blas3_gemm.hpp @@ -115,8 +115,6 @@ namespace Test { Kokkos::deep_copy(C2,C); - Kokkos::fence(); - struct VanillaGEMM vgemm; vgemm.A_t = A_t; vgemm.B_t = B_t; vgemm.A_c = A_c; vgemm.B_c = B_c; @@ -130,8 +128,6 @@ namespace Test { KokkosBlas::gemm(TA,TB,alpha,A,B,beta,C); - Kokkos::fence(); - mag_type diff_C = 0; struct DiffGEMM diffgemm; diffgemm.N = N; diff --git a/unit_test/blas/Test_Blas3_trmm.hpp b/unit_test/blas/Test_Blas3_trmm.hpp index 9f72bd5e63..4c8d154c15 100644 --- a/unit_test/blas/Test_Blas3_trmm.hpp +++ b/unit_test/blas/Test_Blas3_trmm.hpp @@ -121,7 +121,6 @@ namespace Test { Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRMM", Kokkos::RangePolicy(0,K), nudtrmm); } Kokkos::fill_random(B, rand_pool, Kokkos::rand, ScalarA>::max()); - Kokkos::fence(); Kokkos::deep_copy(host_A, A); // Make host_A a lower triangle @@ -162,11 +161,9 @@ namespace Test { vgemm.beta = beta; Kokkos::parallel_for("KokkosBlas::Test::VanillaGEMM", Kokkos::TeamPolicy(M,Kokkos::AUTO,16), vgemm); } - Kokkos::fence(); Kokkos::deep_copy(host_B_expected, B_expected); KokkosBlas::trmm(side, uplo, trans, diag, alpha, A, B); - Kokkos::fence(); Kokkos::deep_copy(host_B_actual, B); bool test_flag = true; diff --git a/unit_test/blas/Test_Blas3_trsm.hpp b/unit_test/blas/Test_Blas3_trsm.hpp index 8fec44b637..ca9c40ae7e 100644 --- a/unit_test/blas/Test_Blas3_trsm.hpp +++ b/unit_test/blas/Test_Blas3_trsm.hpp @@ -127,8 +127,6 @@ namespace Test { ScalarA alpha_trmm = ScalarA(1)/alpha; ScalarA beta = ScalarA(0); - Kokkos::fence(); - if ((uplo[0]=='L')||(uplo[0]=='l')) { for (int i = 0; i < K-1; i++) for (int j = i+1; j < K; j++) From 7e41bbb428b8148d55b8c547e08c5c41f02f3359 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 23 Mar 2021 17:39:54 -0600 Subject: [PATCH 107/126] Fix #917: ctor CrsMat mirror with CrsGraph mirror Also add deep copy constructor for CrsMatrix. 
Takes a label and any other CrsMatrix<...>, as long as it has same scalar/index/offset types. --- src/sparse/KokkosSparse_CrsMatrix.hpp | 42 ++++++++++++++++------ unit_test/sparse/Test_Sparse_CrsMatrix.hpp | 27 ++++++++++++++ 2 files changed, 59 insertions(+), 10 deletions(-) diff --git a/src/sparse/KokkosSparse_CrsMatrix.hpp b/src/sparse/KokkosSparse_CrsMatrix.hpp index d866a63601..3a3b20e603 100644 --- a/src/sparse/KokkosSparse_CrsMatrix.hpp +++ b/src/sparse/KokkosSparse_CrsMatrix.hpp @@ -411,7 +411,7 @@ class CrsMatrix { typedef SizeType size_type; //! Type of a host-memory mirror of the sparse matrix. - typedef CrsMatrix HostMirror; + typedef CrsMatrix HostMirror; //! Type of the graph structure of the sparse matrix. typedef Kokkos::StaticCrsGraph StaticCrsGraphType; //! Type of the graph structure of the sparse matrix - consistent with Kokkos. @@ -494,14 +494,36 @@ class CrsMatrix { //as the constructor of StaticCrsGraph does not allow copy from non const version. } + //! Deep copy constructor (can cross spaces) + template + CrsMatrix (const std::string& label, + const CrsMatrix& mat_) + { + typename row_map_type::non_const_type rowmap(Kokkos::ViewAllocateWithoutInitializing("rowmap"), mat_.graph.row_map.extent(0)); + index_type cols(Kokkos::ViewAllocateWithoutInitializing("cols"), mat_.nnz()); + values = values_type(Kokkos::ViewAllocateWithoutInitializing("values"), mat_.nnz()); + Kokkos::deep_copy(rowmap, mat_.graph.row_map); + Kokkos::deep_copy(cols, mat_.graph.entries); + Kokkos::deep_copy(values, mat_.values); + + numCols_ = mat_.numCols(); + graph = StaticCrsGraphType(cols, rowmap); + +#ifdef KOKKOS_USE_CUSPARSE + cusparseCreate (&cusparse_handle); + cusparseCreateMatDescr (&cusparse_descr); +#endif // KOKKOS_USE_CUSPARSE + } + /// \brief Construct with a graph that will be shared. /// /// Allocate the values array for subsquent fill. 
- CrsMatrix (const std::string& arg_label, - const staticcrsgraph_type& arg_graph) : - graph (arg_graph), - values (arg_label, arg_graph.entries.extent(0)), - numCols_ (maximum_entry (arg_graph) + 1) + template + CrsMatrix (const std::string& label, + const Kokkos::StaticCrsGraph& graph_) : + graph (graph_.entries, graph_.row_map), + values (label, graph_.entries.extent(0)), + numCols_ (maximum_entry (graph_) + 1) {} /// \brief Constructor that copies raw arrays of host data in @@ -609,11 +631,12 @@ class CrsMatrix { /// \param rows [in/out] The row map (containing the offsets to the /// data in each row). /// \param cols [in/out] The column indices. - CrsMatrix (const std::string& /* label */, + template + CrsMatrix (const std::string&, const OrdinalType& ncols, const values_type& vals, - const staticcrsgraph_type& graph_) : - graph (graph_), + const Kokkos::StaticCrsGraph& graph_) : + graph (graph_.entries, graph_.row_map), values (vals), numCols_ (ncols) { @@ -888,7 +911,6 @@ ctor_impl (const std::string &label, row_lengths[i] = rows[i + 1] - rows[i]; } - str = label; graph = Kokkos::create_staticcrsgraph (str.append (".graph"), row_lengths); typename values_type::HostMirror h_values = Kokkos::create_mirror_view (values); typename index_type::HostMirror h_entries = Kokkos::create_mirror_view (graph.entries); diff --git a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp index 13513fef14..5f23141581 100644 --- a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp @@ -189,9 +189,36 @@ testCrsMatrix () //printf ("A is %d by %d\n", A.numRows (), A.numCols ()); } +template +void +testCrsMatrixHostMirror () +{ + using namespace Test; + using crs_matrix = KokkosSparse::CrsMatrix; + using crs_matrix_host = typename crs_matrix::HostMirror; + using crs_graph = typename crs_matrix::StaticCrsGraphType; + using crs_graph_host = typename crs_graph::HostMirror; + crs_matrix A = makeCrsMatrix(); + 
typename crs_matrix::values_type::HostMirror valuesHost("values host", A.nnz()); + typename crs_matrix::row_map_type::HostMirror rowmapHost("rowmap host", A.numRows() + 1); + typename crs_matrix::index_type::HostMirror entriesHost("entries host", A.nnz()); + crs_graph_host graphHost(entriesHost, rowmapHost); + //Test the two CrsMatrix constructors that take the StaticCrsGraph + crs_matrix_host Ahost1("Ahost1", graphHost); + crs_matrix_host Ahost2("Ahost2", A.numCols(), valuesHost, graphHost); + //Test deep copy constructor (can copy between any two spaces) + { + crs_matrix Bdev("B device", Ahost1); + crs_matrix_host Bhost("B host", A); + } +} + #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( TestCategory, sparse ## _ ## crsmatrix ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ testCrsMatrix (); \ +} \ +TEST_F( TestCategory, sparse ## _ ## crsmatrix_host_mirror ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ + testCrsMatrixHostMirror (); \ } From b799cd0f4ed8817efd042891ec8f1324687c1cbe Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Thu, 25 Mar 2021 14:21:22 -0600 Subject: [PATCH 108/126] kokkoskernels_tribits minor changes; add gtest lib to perf_test directory cmake/kokkoskernels_tribits.cmake: - Fix parsed variable name - IF/ELSE target_link_libraries calls in non-Trilnos KOKKOSKERNELS_ADD_EXECUTABLE branch done to prevent two calls with same target (cmake configure error) perf_test/CMakeLists.txt: - Add gtest test lib to address comilation issues in Trilinos when including "gtest.h" with perf_test files i.e. 
"fatal error: gtest/gtest.h: No such file or directory" perf_test/blas/blas3/CMakeLists.txt: - Pass kokkoskernelsperf_gtest as TESTONLYLIBS to KOKKOSKERNELS_ADD_EXECUTABLE --- cmake/kokkoskernels_tribits.cmake | 7 ++++--- perf_test/CMakeLists.txt | 16 ++++++++++++++++ perf_test/blas/blas3/CMakeLists.txt | 2 +- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/cmake/kokkoskernels_tribits.cmake b/cmake/kokkoskernels_tribits.cmake index 4eebb97c7b..b023d7c4d2 100644 --- a/cmake/kokkoskernels_tribits.cmake +++ b/cmake/kokkoskernels_tribits.cmake @@ -149,12 +149,13 @@ IF (IS_ENABLED) IF (KOKKOSKERNELS_HAS_TRILINOS) TRIBITS_ADD_EXECUTABLE(${EXE_NAME} SOURCES ${PARSE_SOURCES} - TESTONLYLIBS ${TESTONLYLIBS}) + TESTONLYLIBS ${PARSE_TESTONLYLIBS}) ELSE() ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkoskernels) IF (PARSE_TESTONLYLIBS) - TARGET_LINK_LIBRARIES(${EXE_NAME} ${PARSE_TESTONLYLIBS}) + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkoskernels ${PARSE_TESTONLYLIBS}) + ELSE () + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkoskernels) ENDIF() ENDIF() ELSE() diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index fe3b3c51ba..08788d648d 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -10,6 +10,22 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../test_common) # build correctly with or without MPI, but only run them with a single # MPI process. +SET(GTEST_SOURCE_DIR ${PACKAGE_SOURCE_DIR}/tpls/gtest) + +KOKKOSKERNELS_ADD_TEST_LIBRARY( + kokkoskernelsperf_gtest + HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h + SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc + ) +# Disables pthreads, this is a problem for serial builds in Trilinos & Sierra if it's enabled. 
+TARGET_COMPILE_DEFINITIONS(kokkoskernelsperf_gtest PUBLIC "-DGTEST_HAS_PTHREAD=0") +TARGET_INCLUDE_DIRECTORIES(kokkoskernelsperf_gtest PUBLIC $) + +#Gtest minimally requires C++11 +TARGET_COMPILE_FEATURES(kokkoskernelsperf_gtest PUBLIC cxx_std_11) + + + ADD_COMPONENT_SUBDIRECTORY(batched) ADD_COMPONENT_SUBDIRECTORY(graph) ADD_COMPONENT_SUBDIRECTORY(sparse) diff --git a/perf_test/blas/blas3/CMakeLists.txt b/perf_test/blas/blas3/CMakeLists.txt index 8f83bd6b99..73c094387c 100644 --- a/perf_test/blas/blas3/CMakeLists.txt +++ b/perf_test/blas/blas3/CMakeLists.txt @@ -1,8 +1,8 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/tpls/gtest) KOKKOSKERNELS_ADD_EXECUTABLE( KokkosBlas3_perf_test SOURCES KokkosBlas3_perf_test.cpp + TESTONLYLIBS kokkoskernelsperf_gtest ) From 56b7433cc5d3faa49cd22ff8acfc84a06f492833 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 26 Mar 2021 12:04:04 -0600 Subject: [PATCH 109/126] CrsMatrix: make template args readable, consistent also test the deep-copy CrsMatrix constructor with a zero-extent rowmap. --- src/sparse/KokkosSparse_CrsMatrix.hpp | 26 +++++++++++----------- unit_test/sparse/Test_Sparse_CrsMatrix.hpp | 10 +++++++++ 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/sparse/KokkosSparse_CrsMatrix.hpp b/src/sparse/KokkosSparse_CrsMatrix.hpp index 3a3b20e603..d734d9ac3a 100644 --- a/src/sparse/KokkosSparse_CrsMatrix.hpp +++ b/src/sparse/KokkosSparse_CrsMatrix.hpp @@ -473,13 +473,13 @@ class CrsMatrix { {} //! Copy constructor (shallow copy). - template + template KOKKOS_INLINE_FUNCTION - CrsMatrix (const CrsMatrix & B) : + CrsMatrix (const CrsMatrix & B) : graph (B.graph.entries, B.graph.row_map), values (B.values), dev_config (B.dev_config), @@ -495,9 +495,9 @@ class CrsMatrix { } //! 
Deep copy constructor (can cross spaces) - template - CrsMatrix (const std::string& label, - const CrsMatrix& mat_) + template + CrsMatrix (const std::string&, + const CrsMatrix& mat_) { typename row_map_type::non_const_type rowmap(Kokkos::ViewAllocateWithoutInitializing("rowmap"), mat_.graph.row_map.extent(0)); index_type cols(Kokkos::ViewAllocateWithoutInitializing("cols"), mat_.nnz()); @@ -518,9 +518,9 @@ class CrsMatrix { /// \brief Construct with a graph that will be shared. /// /// Allocate the values array for subsquent fill. - template + template CrsMatrix (const std::string& label, - const Kokkos::StaticCrsGraph& graph_) : + const Kokkos::StaticCrsGraph& graph_) : graph (graph_.entries, graph_.row_map), values (label, graph_.entries.extent(0)), numCols_ (maximum_entry (graph_) + 1) @@ -631,11 +631,11 @@ class CrsMatrix { /// \param rows [in/out] The row map (containing the offsets to the /// data in each row). /// \param cols [in/out] The column indices. - template + template CrsMatrix (const std::string&, const OrdinalType& ncols, const values_type& vals, - const Kokkos::StaticCrsGraph& graph_) : + const Kokkos::StaticCrsGraph& graph_) : graph (graph_.entries, graph_.row_map), values (vals), numCols_ (ncols) diff --git a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp index 5f23141581..85b427d445 100644 --- a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp @@ -211,6 +211,16 @@ testCrsMatrixHostMirror () crs_matrix Bdev("B device", Ahost1); crs_matrix_host Bhost("B host", A); } + //Test the empty (0x0, 0 entries) case - zero-length rowmap. 
+ typename crs_graph::row_map_type::non_const_type zeroRowmap; + typename crs_graph::entries_type zeroEntries; + typename crs_matrix::values_type zeroValues; + crs_matrix zero("ZeroRow", 0, 0, 0, zeroValues, zeroRowmap, zeroEntries); + crs_matrix_host zeroHost("zero1Host", zero); + EXPECT_EQ(zeroHost.numRows(), 0); + EXPECT_EQ(zeroHost.numCols(), 0); + EXPECT_EQ(zeroHost.nnz(), 0); + EXPECT_EQ(zeroHost.graph.row_map.extent(0), 0); } #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ From 274447524bb8b33d8fecc62dc6f2e6fbdfb72b29 Mon Sep 17 00:00:00 2001 From: iyamaza Date: Fri, 26 Mar 2021 15:14:17 -0600 Subject: [PATCH 110/126] add new options for two-stage Gauss Seidel > damping factors for outer and inner sweeps > compact form (needs a fewer flops, may be of a lower-quality) also removed some codes, which are now not used --- src/common/KokkosKernels_Handle.hpp | 21 +- .../KokkosSparse_gauss_seidel_handle.hpp | 69 +- ...okkosSparse_twostage_gauss_seidel_impl.hpp | 713 ++++++++++++------ 3 files changed, 568 insertions(+), 235 deletions(-) diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp index 2e335d4f04..2be4d345ce 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/common/KokkosKernels_Handle.hpp @@ -580,12 +580,24 @@ class KokkosKernelsHandle return gs2; } // ---------------------------------------- // - // Specify to use either Two-stage or Classical (i.e., inner Jacobi-Richardson or SpTrsv) + // Specify numer of outer sweeps for two-stage Gauss-Seidel + void set_gs_set_num_outer_sweeps (int num_outer_sweeps) { + auto gs2 = get_twostage_gs_handle(); + gs2->setNumOuterSweeps (num_outer_sweeps); + } + // ---------------------------------------- // + // Specify numer of inner sweeps for two-stage Gauss-Seidel void set_gs_set_num_inner_sweeps (int num_inner_sweeps) { auto gs2 = get_twostage_gs_handle(); gs2->setNumInnerSweeps (num_inner_sweeps); } // ---------------------------------------- // + // Specify damping 
factor of inner sweeps for two-stage Gauss-Seidel + void set_gs_set_inner_damp_factor (scalar_t_ damp_factor) { + auto gs2 = get_twostage_gs_handle(); + gs2->setInnerDampFactor (damp_factor); + } + // ---------------------------------------- // // Specify to use either Two-stage or Classical (i.e., inner Jacobi-Richardson or SpTrsv) void set_gs_twostage (bool two_stage, size_type nrows) { auto gs2 = get_twostage_gs_handle(); @@ -608,6 +620,13 @@ class KokkosKernelsHandle } } } + // ---------------------------------------- // + // Specify to use either Compact or Classical form of recurrence + void set_gs_twostage_compact_form (bool compact_form) { + auto gs2 = get_twostage_gs_handle(); + gs2->setCompactForm (compact_form); + } + void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster) { this->destroy_gs_handle(); diff --git a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp index fd4a9b58d9..9b03c9d63e 100644 --- a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp @@ -573,8 +573,13 @@ namespace KokkosSparse{ nrhs (1), direction (GS_SYMMETRIC), two_stage (true), - num_inner_sweeps (1) - {} + compact_form (false), + num_inner_sweeps (1), + num_outer_sweeps (1) + { + const scalar_t one (1.0); + inner_omega = one; + } // Sweep direction void setSweepDirection (GSDirection direction_) { @@ -592,6 +597,22 @@ namespace KokkosSparse{ return this->two_stage; } + // specify whether to use compact form of recurrence + void setCompactForm (bool compact_form_) { + this->compact_form = compact_form_; + } + bool isCompactForm () { + return this->compact_form; + } + + // Number of outer sweeps + void setNumOuterSweeps (int num_outer_sweeps_) { + this->num_outer_sweeps = num_outer_sweeps_; + } + int getNumOuterSweeps () { + return this->num_outer_sweeps; + } + // Number of inner sweeps void setNumInnerSweeps (int num_inner_sweeps_) { 
this->num_inner_sweeps = num_inner_sweeps_; @@ -600,27 +621,57 @@ namespace KokkosSparse{ return this->num_inner_sweeps; } - // workspaces + // Inner damping factor + void setInnerDampFactor (scalar_t inner_omega_) { + this->inner_omega = inner_omega_; + } + scalar_t getInnerDampFactor () { + return this->inner_omega; + } + + // Workspaces + // > diagonal (inverse) void setD (values_view_t D_) { this->D = D_; } values_view_t getD () { return this->D; } - + // > Lower part of diagonal block void setL (crsmat_t L) { this->crsmatL = L; } crsmat_t getL () { return this->crsmatL; } - + // > Upper part of diagonal block void setU (crsmat_t U) { this->crsmatU = U; } crsmat_t getU () { return this->crsmatU; } + // > Complement of U + void setLa (crsmat_t La) { + this->crsmatLa = La; + } + crsmat_t getLa () { + return this->crsmatLa; + } + // > Complement of L + void setUa (crsmat_t Ua) { + this->crsmatUa = Ua; + } + crsmat_t getUa () { + return this->crsmatUa; + } + // > diagonal (not-inverse) + void setDa (values_view_t Da_) { + this->Da = Da_; + } + values_view_t getDa () { + return this->Da; + } void initVectors (int nrows_, int nrhs_) { if (this->nrows != nrows_ || this->nrhs != nrhs_) { @@ -650,6 +701,11 @@ namespace KokkosSparse{ values_view_t D; crsmat_t crsmatL; crsmat_t crsmatU; + // > complements for compact form of recurrence + // where La = A - U and Ua = A - U + values_view_t Da; + crsmat_t crsmatLa; + crsmat_t crsmatUa; // > residual vector for outer GS, Rk = B-A*Xk vector_view_t localR; @@ -661,7 +717,10 @@ namespace KokkosSparse{ // solver parameters GSDirection direction; bool two_stage; + bool compact_form; int num_inner_sweeps; + int num_outer_sweeps; + scalar_t inner_omega; }; // ------------------------------------- } diff --git a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index f78a7e0cd4..73fb486c4e 100644 --- a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp 
+++ b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -61,7 +61,7 @@ #include "KokkosSparse_gauss_seidel_handle.hpp" -#define KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV +//#define KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS namespace KokkosSparse{ namespace Impl{ @@ -103,6 +103,10 @@ namespace KokkosSparse{ input_size_t>; using input_graph_t = typename input_crsmat_t::StaticCrsGraphType; using single_vector_view_t = Kokkos::View; + using internal_vector_view_t = typename TwoStageGaussSeidelHandleType::vector_view_t; + + using ST = Kokkos::Details::ArithTraits; + using mag_t = typename ST::mag_type; private: HandleType *handle; @@ -132,13 +136,11 @@ namespace KokkosSparse{ struct Tag_countNnzL{}; struct Tag_countNnzU{}; // tag for inserting entries - struct Tag_entriesL{}; - struct Tag_entriesU{}; struct Tag_entriesLU{}; // tag for inserting values - struct Tag_valuesL{}; - struct Tag_valuesU{}; struct Tag_valuesLU{}; + // tag for computing residual norm + struct Tag_normR{}; template ::one (); - ordinal_t nnz = row_map (i); - for (size_type k = rowmap_view (i); k < rowmap_view (i+1); k++) { - if (column_view (k) < i) { - values (nnz) = values_view (k); - nnz ++; - } else if (column_view (k) == i) { - if (two_stage) { - if (diagos_given) { - diags (i) = d_invert_view (i); - } else { - diags (i) = one / values_view (k); - } - } else { - values (nnz) = values_view (k); - nnz ++; - } - } - } - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) - if (two_stage) { - for (size_type k = row_map (i); k < nnz; k++) { - values (k) *= diags (i); + if (i == 0) { + row_map_a (0) = 0; } } - #endif + nnz += nnz_i; } - // ------------------------------------------------------- // // functor for counting nnzU (with parallel_reduce) KOKKOS_INLINE_FUNCTION @@ -369,55 +384,18 @@ namespace KokkosSparse{ if (i == 0) { row_map (0) = 0; } - nnz += nnz_i; - } - - // functor for storing entriesU (with parallel_for) - KOKKOS_INLINE_FUNCTION - void operator()(const Tag_entriesU&, const 
ordinal_t i) const - { - ordinal_t nnz = row_map (i); - for (size_type k = rowmap_view (i); k < rowmap_view (i+1); k++) { - if (column_view (k) > i && column_view (k) < num_rows) { - entries (nnz) = column_view (k); - nnz ++; - } else if(!two_stage && column_view (k) == i) { - entries (nnz) = column_view (k); - nnz ++; + if (compact_form) { + // complement of U+D + row_map_a (i+1) = (rowmap_view (i+1) - rowmap_view (i)) - nnz_i; + if (two_stage) { + // two-stage iterates with U (no D) + row_map_a (i+1) --; } - } - } - - // functor for storing valuesU (with parallel_for) - KOKKOS_INLINE_FUNCTION - void operator()(const Tag_valuesU&, const ordinal_t i) const - { - const_scalar_t one = Kokkos::Details::ArithTraits::one (); - ordinal_t nnz = row_map (i); - for (size_type k = rowmap_view (i); k < rowmap_view (i+1); k++) { - if (column_view (k) == i) { - if (two_stage) { - if (diagos_given) { - diags (i) = d_invert_view (i); - } else { - diags (i) = one / values_view (k); - } - } else { - values (nnz) = values_view (k); - nnz ++; - } - } else if (column_view (k) > i && column_view (k) < num_rows) { - values (nnz) = values_view (k); - nnz ++; - } - } - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) - if (two_stage) { - for (size_type k = row_map (i); k < nnz; k++) { - values (k) *= diags (i); + if (i == 0) { + row_map_a (0) = 0; } } - #endif + nnz += nnz_i; } // ------------------------------------------------------- // @@ -427,6 +405,12 @@ namespace KokkosSparse{ { ordinal_t nnzL = row_map (i); ordinal_t nnzU = row_map2 (i); + ordinal_t nnzLa = 0; + ordinal_t nnzUa = 0; + if (compact_form) { + nnzLa = row_map_a (i); + nnzUa = row_map_a2 (i); + } if (!two_stage) { // NOTE: Kokkos' sptrsv assumes diagonal of U to be at the start entries2 (nnzU) = i; @@ -434,11 +418,32 @@ namespace KokkosSparse{ } for (size_type k = rowmap_view (i); k < rowmap_view (i+1); k++) { if (column_view (k) < i) { + // L entries (nnzL) = column_view (k); nnzL ++; - } else if (column_view (k) 
> i && column_view (k) < num_rows) { - entries2 (nnzU) = column_view (k); - nnzU ++; + if (compact_form) { + // complement of U+D + entries_a (nnzLa) = column_view (k); + nnzLa ++; + } + } else if (column_view (k) > i) { + if (column_view (k) < num_rows) { + // U + entries2 (nnzU) = column_view (k); + nnzU ++; + if (compact_form) { + // complement of L+D + entries_a2 (nnzUa) = column_view (k); + nnzUa ++; + } + } else if (compact_form) { + // complement of U+D + entries_a (nnzLa) = column_view (k); + nnzLa ++; + // complement of L+D + entries_a2 (nnzUa) = column_view (k); + nnzUa ++; + } } } if (!two_stage) { @@ -455,6 +460,12 @@ namespace KokkosSparse{ const_scalar_t one = Kokkos::Details::ArithTraits::one (); ordinal_t nnzL = row_map (i); ordinal_t nnzU = row_map2 (i); + ordinal_t nnzLa = 0; + ordinal_t nnzUa = 0; + if (compact_form) { + nnzLa = row_map_a (i); + nnzUa = row_map_a2 (i); + } if (!two_stage) { // Kokkos' sptrsv assumes diagonal U to come at the start, so increment nnzU nnzU ++; @@ -464,6 +475,11 @@ namespace KokkosSparse{ // save L (without diag) values (nnzL) = values_view (k); nnzL ++; + if (compact_form) { + // complement of U+D + values_a (nnzLa) = values_view (k); + nnzLa ++; + } } else if (column_view (k) == i) { // save D if (diagos_given) { @@ -473,10 +489,27 @@ namespace KokkosSparse{ // as original diags (i) = values_view (k); } - } else if (column_view (k) < num_rows) { - // save U (without diag) - values2 (nnzU) = values_view (k); - nnzU ++; + if (compact_form) { + diags_a (i) = values_view (k); + } + } else { + if (column_view (k) < num_rows) { + // save U (without diag) + values2 (nnzU) = values_view (k); + nnzU ++; + if (compact_form) { + // complement of L+D + values_a2 (nnzUa) = values_view (k); + nnzUa ++; + } + } else if (compact_form) { + // complement of U+D + values_a (nnzLa) = values_view (k); + nnzLa ++; + // complement of L+D + values_a2 (nnzUa) = values_view (k); + nnzUa ++; + } } } if (!two_stage) { @@ -485,27 +518,50 @@ 
namespace KokkosSparse{ nnzU = row_map2 (i); if (diagos_given) { values2 (nnzU) = one / diags (i); - values (nnzL) = one / diags (i); + values (nnzL) = one / diags (i); } else { values2 (nnzU) = diags (i); - values (nnzL) = diags (i); + values (nnzL) = diags (i); } } - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) if (two_stage) { if (!diagos_given) { // when diag is provided, it is already provided as inverse diags (i) = one / diags (i); } - // compute inv(D)*L + // compute inv(D)*L (apply row-scaling to valueL) for (size_type k = row_map (i); k < row_map (i+1); k++) { values (k) *= diags (i); } + // compute inv(D)*U (apply row-scaling to valueU) for (size_type k = row_map2 (i); k < row_map2 (i+1); k++) { values2 (k) *= diags (i); } } - #endif + } + + // ------------------------------------------------------- // + // functor for computing residual norm (with parallel_reduce) + KOKKOS_INLINE_FUNCTION + void operator()(const Tag_normR&, const ordinal_t i, mag_t &normR) const + { + scalar_t normRi = localB (i, 0); + if (forward_sweep) { + // compute R(i) = B(i) - (L+D)(i,:)*X + for (size_type k = rowmap_view (i); k < rowmap_view (i+1); k++) { + if (column_view (k) <= i) { + normRi -= values_view (k) * localX (column_view (k), 0); + } + } + } else { + // compute R(i) = B(i) - (D+U)(i,:)*X + for (size_type k = rowmap_view (i); k < rowmap_view (i+1); k++) { + if (column_view (k) >= i && column_view (k) < num_rows) { + normRi -= values_view (k) * localX (column_view (k), 0); + } + } + } + normR += ST::abs (normRi * normRi); } }; // --------------------------------------------------------- // @@ -574,27 +630,44 @@ namespace KokkosSparse{ #endif auto *gsHandle = get_gs_handle(); bool two_stage = gsHandle->isTwoStage (); + bool compact_form = gsHandle->isCompactForm (); GSDirection direction = gsHandle->getSweepDirection (); using GS_Functor_t = TwostageGaussSeidel_functor; // count nnz in local L & U matrices (rowmap_viewL/rowmap_viewU stores offsets for each row) 
- ordinal_t nnzL = 0; - ordinal_t nnzU = 0; - row_map_view_t rowmap_viewL ("row_mapL", num_rows+1); - row_map_view_t rowmap_viewU ("row_mapU", num_rows+1); + ordinal_t nnzA = column_view.extent (0); + ordinal_t nnzL = 0; // lower-part of diagonal block + ordinal_t nnzU = 0; // upper-part of diagonal block + row_map_view_t rowmap_viewL ("row_mapL", num_rows+1); // lower-part of diagonal block + row_map_view_t rowmap_viewU ("row_mapU", num_rows+1); // upper-part of diagonal block + row_map_view_t rowmap_viewLa ("row_mapLa", num_rows+1); // complement of U+D + row_map_view_t rowmap_viewUa ("row_mapUa", num_rows+1); // complement of L+D if (direction == GS_FORWARD || direction == GS_SYMMETRIC) { using range_policy = Kokkos::RangePolicy ; Kokkos::parallel_reduce ("nnzL", range_policy (0, num_rows), - GS_Functor_t (two_stage, num_rows, rowmap_view, column_view, - rowmap_viewL), + GS_Functor_t (two_stage, compact_form, + num_rows, rowmap_view, column_view, + rowmap_viewL, rowmap_viewUa), nnzL); } if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { using range_policy = Kokkos::RangePolicy ; Kokkos::parallel_reduce ("nnzU", range_policy (0, num_rows), - GS_Functor_t (two_stage, num_rows, rowmap_view, column_view, - rowmap_viewU), + GS_Functor_t (two_stage, compact_form, + num_rows, rowmap_view, column_view, + rowmap_viewU, rowmap_viewLa), nnzU); } + ordinal_t nnzLa = 0; // complement of U+D + ordinal_t nnzUa = 0; // complement of L+D + if (compact_form) { + nnzLa = nnzA - nnzU; + nnzUa = nnzA - nnzL; + if (two_stage) { + // two-stage iterates with L or U (no D) + nnzLa -= num_rows; + nnzUa -= num_rows; + } + } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = timer.seconds (); @@ -605,10 +678,18 @@ namespace KokkosSparse{ if (direction == GS_FORWARD || direction == GS_SYMMETRIC) { KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum (1+num_rows, rowmap_viewL); + if (compact_form) { + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum + 
(1+num_rows, rowmap_viewLa); + } } if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum (1+num_rows, rowmap_viewU); + if (compact_form) { + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum + (1+num_rows, rowmap_viewUa); + } } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); @@ -626,6 +707,14 @@ namespace KokkosSparse{ // allocate memory to store local U entries_view_t column_viewU (Kokkos::ViewAllocateWithoutInitializing("entriesU"), nnzU); values_view_t values_viewU (Kokkos::ViewAllocateWithoutInitializing("valuesU"), nnzU); + + // allocate memory to store complement of U+D + entries_view_t column_viewLa (Kokkos::ViewAllocateWithoutInitializing("entriesLa"), nnzLa); + values_view_t values_viewLa (Kokkos::ViewAllocateWithoutInitializing("valuesLa"), nnzLa); + + // allocate memory to store complement of L+D + entries_view_t column_viewUa (Kokkos::ViewAllocateWithoutInitializing("entriesUa"), nnzUa); + values_view_t values_viewUa (Kokkos::ViewAllocateWithoutInitializing("valuesUa"), nnzUa); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = timer.seconds (); @@ -636,10 +725,14 @@ namespace KokkosSparse{ { // extract local L & U structures (for computing (L+D)^{-1} or (D+U)^{-1}) using range_policy = Kokkos::RangePolicy ; - Kokkos::parallel_for ("entryLU", range_policy (0, num_rows), - GS_Functor_t (two_stage, num_rows, rowmap_view, column_view, - rowmap_viewL, column_viewL, - rowmap_viewU, column_viewU)); + Kokkos::parallel_for ("entriesLU", range_policy (0, num_rows), + GS_Functor_t (two_stage, compact_form, + num_rows, rowmap_view, column_view, + rowmap_viewL, column_viewL, + rowmap_viewU, column_viewU, + // + rowmap_viewLa, column_viewLa, + rowmap_viewUa, column_viewUa)); } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); @@ -658,6 +751,22 @@ namespace KokkosSparse{ gsHandle->setL (crsmatL); gsHandle->setU (crsmatU); gsHandle->setD (viewD); + + if (compact_form) { 
+ // construct complements + graph_t graphLa (column_viewLa, rowmap_viewLa); + graph_t graphUa (column_viewUa, rowmap_viewUa); + crsmat_t crsmatLa ("La", num_rows, values_viewLa, graphLa); + crsmat_t crsmatUa ("Ua", num_rows, values_viewUa, graphUa); + + // store them in handle + gsHandle->setLa (crsmatLa); + gsHandle->setUa (crsmatUa); + + values_view_t viewDa (Kokkos::ViewAllocateWithoutInitializing("diags"), num_rows); + gsHandle->setDa (viewDa); + } + if (!(gsHandle->isTwoStage ())) { // create SpTRSV handles for classical GS using namespace KokkosSparse::Experimental; @@ -685,9 +794,11 @@ namespace KokkosSparse{ auto *gsHandle = get_gs_handle(); bool two_stage = gsHandle->isTwoStage (); + bool compact_form = gsHandle->isCompactForm (); // load local D from handle auto viewD = gsHandle->getD (); + auto viewDa = gsHandle->getDa (); // load local L from handle auto crsmatL = gsHandle->getL (); @@ -701,13 +812,26 @@ namespace KokkosSparse{ auto rowmap_viewU = crsmatU.graph.row_map; auto column_viewU = crsmatU.graph.entries; + // load complement of U+D from handle + auto crsmatLa = gsHandle->getLa (); + auto values_viewLa = crsmatLa.values; + auto rowmap_viewLa = crsmatLa.graph.row_map; + + // load complement of L+D from handle + auto crsmatUa = gsHandle->getUa (); + auto values_viewUa = crsmatUa.values; + auto rowmap_viewUa = crsmatUa.graph.row_map; + + // extract local L, D & U matrices using range_policy = Kokkos::RangePolicy ; Kokkos::parallel_for ("valueLU", range_policy (0, num_rows), - GS_Functor_t (two_stage, diagos_given, num_rows, + GS_Functor_t (two_stage, compact_form, diagos_given, num_rows, rowmap_view, column_view, values_view, d_invert_view, - rowmap_viewL, column_viewL, values_viewL, viewD, - rowmap_viewU, column_viewU, values_viewU)); + rowmap_viewL, values_viewL, viewD, + rowmap_viewU, values_viewU, + rowmap_viewLa, values_viewLa, viewDa, + rowmap_viewUa, values_viewUa)); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = 
timer.seconds (); @@ -742,7 +866,7 @@ namespace KokkosSparse{ y_value_array_type localB, // in bool init_zero_x_vector = false, int numIter = 1, - scalar_t omega = Kokkos::Details::ArithTraits::one(), + scalar_t omega = ST::one(), bool apply_forward = true, bool apply_backward = true, bool update_y_vector = true) @@ -759,6 +883,9 @@ namespace KokkosSparse{ // auto *gsHandle = get_gs_handle(); bool two_stage = gsHandle->isTwoStage (); + bool compact_form = gsHandle->isCompactForm (); + scalar_t gamma = gsHandle->getInnerDampFactor (); + GSDirection direction = gsHandle->getSweepDirection (); if (apply_forward && apply_backward) { direction = GS_SYMMETRIC; @@ -772,8 +899,11 @@ namespace KokkosSparse{ // load auxiliary matrices from handle auto localD = gsHandle->getD (); - auto crsmatL = gsHandle->getL (); - auto crsmatU = gsHandle->getU (); + auto crsmatL = gsHandle->getL (); // lower-part of diagonal block + auto crsmatU = gsHandle->getU (); // upper-part of diagonal block + auto localDa = gsHandle->getDa (); + auto crsmatLa = gsHandle->getLa (); // complement of L+D (used only for compact form) + auto crsmatUa = gsHandle->getUa (); // complement of U+D (used only for compact form) // wratp A into crsmat input_graph_t graphA (column_view, rowmap_view); @@ -781,21 +911,25 @@ namespace KokkosSparse{ #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = timer.seconds (); + std::cout << std::endl << "TWO-STAGE GS::APPLY with " << numIter << " outer GS sweeps with omega = " << omega + << ", and " << gsHandle->getNumInnerSweeps () << " inner JR sweeps, with gamma = " << gamma + << " (numRows=" << num_rows << ")" + << std::endl; std::cout << std::endl << "TWO-STAGE GS::APPLY::CREATE CRS_A TIME : " << tic << std::endl; timer.reset(); #endif // load auxiliary vectors - int nrows = num_rows; int nrhs = localX.extent (1); - gsHandle->initVectors (nrows, nrhs); + gsHandle->initVectors (num_rows, nrhs); auto localR = gsHandle->getVectorR (); auto localT = 
gsHandle->getVectorT (); auto localZ = gsHandle->getVectorZ (); // outer Gauss-Seidel iteration - int NumSweeps = numIter; + int NumOuterSweeps = gsHandle->getNumOuterSweeps (); int NumInnerSweeps = gsHandle->getNumInnerSweeps (); + int NumSweeps = (NumOuterSweeps > numIter ? NumOuterSweeps : numIter); if (direction == GS_SYMMETRIC) { NumSweeps *= 2; } @@ -803,103 +937,224 @@ namespace KokkosSparse{ KokkosKernels::Impl::zero_vector(nrhs, localX); } for (int sweep = 0; sweep < NumSweeps; ++sweep) { - // R = B - A*x + bool forward_sweep = (direction == GS_FORWARD || + (direction == GS_SYMMETRIC && sweep%2 == 0)); + // compute residual vector KokkosBlas::scal (localR, one, localB); if (sweep > 0 || !init_zero_x_vector) { - KokkosSparse:: - spmv ("N", scalar_t(-one), crsmatA, - localX, - one, localR); + if (compact_form) { + if (forward_sweep) { + // R = B - U*x + KokkosSparse:: + spmv ("N", scalar_t(-one), crsmatUa, + localX, + one, localR); + } else { + // R = B - L*x + KokkosSparse:: + spmv ("N", scalar_t(-one), crsmatLa, + localX, + one, localR); + } + if (omega != one) { + // R = B - (U + (1-1/omega)D)*x + scalar_t omega2 = (one/omega - one); + auto localY = Kokkos::subview (localX, range_type(0, num_rows), Kokkos::ALL ()); + KokkosBlas::mult (zero, localZ, + one, localDa, localY); + KokkosBlas::axpy (omega2, localZ, localR); + } + } else { // not compact_form + // R = B - A*x + KokkosSparse:: + spmv ("N", scalar_t(-one), crsmatA, + localX, + one, localR); +#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS + { + auto localRj = Kokkos::subview (localR, Kokkos::ALL (), range_type (0, 1)); + single_vector_view_t Rj (localRj.data (), num_rows); + std::cout << "norm(GS)-" << sweep << " " << KokkosBlas::nrm2 (Rj) + << " (" << (forward_sweep ? 
"forward" : "backward" ) << ")" + << std::endl; + } +#endif + } } - if (!two_stage) { // ===== sparse-triangular solve ===== - if (direction == GS_FORWARD || - (direction == GS_SYMMETRIC && sweep%2 == 0)) { - // Z = (L+D)^{-1} * R + //if (sweep == 0) { + // auto localY = Kokkos::subview (localX, range_type(num_rows, localX.extent(0)), Kokkos::ALL ()); + // Kokkos::deep_copy (localY, zero); + //} + if (!two_stage) { + // ===== sparse-triangular solve ===== + // TODO: omega is not supported here (because omega*L + D is extracted in initialize_numeric, but omega is passed into apply) + // hence, omega = one + if (forward_sweep) { + // Z = (omega * L + D)^{-1} * R // NOTE: need to go over RHSs using namespace KokkosSparse::Experimental; for (int j = 0; j < nrhs; j++) { auto localRj = Kokkos::subview (localR, Kokkos::ALL (), range_type (j, j+1)); auto localZj = Kokkos::subview (localZ, Kokkos::ALL (), range_type (j, j+1)); - single_vector_view_t Rj (localRj.data (), nrows); - single_vector_view_t Zj (localZj.data (), nrows); + single_vector_view_t Rj (localRj.data (), num_rows); + single_vector_view_t Zj (localZj.data (), num_rows); sptrsv_solve (handle->get_gs_sptrsvL_handle(), crsmatL.graph.row_map, crsmatL.graph.entries, crsmatL.values, Rj, Zj); } } else { using namespace KokkosSparse::Experimental; - // Z = (U+D)^{-1} * R + // Z = (omega * U + D)^{-1} * R // NOTE: need to go over RHSs for (int j = 0; j < nrhs; j++) { auto localRj = Kokkos::subview (localR, Kokkos::ALL (), range_type (j, j+1)); auto localZj = Kokkos::subview (localZ, Kokkos::ALL (), range_type (j, j+1)); - single_vector_view_t Rj (localRj.data (), nrows); - single_vector_view_t Zj (localZj.data (), nrows); + single_vector_view_t Rj (localRj.data (), num_rows); + single_vector_view_t Zj (localZj.data (), num_rows); sptrsv_solve (handle->get_gs_sptrsvU_handle(), crsmatU.graph.row_map, crsmatU.graph.entries, crsmatU.values, Rj, Zj); } } - } else { // ====== inner Jacobi-Richardson ===== + + // update 
solution (no omega) + auto localY = Kokkos::subview (localX, range_type(0, num_rows), Kokkos::ALL ()); + if (compact_form) { + // Y = omega * Z + KokkosBlas::scal (localY, one, localZ); + } else { + // Y = Y + omega * Z + KokkosBlas::axpy (one, localZ, localY); + } + } else { + // ====== inner Jacobi-Richardson ===== +#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS + //compute initial residual norm + // > compute RHS for the inner loop, R = B - A*x + internal_vector_view_t tempR ("tempR", num_rows, 1); + KokkosBlas::scal (tempR, one, localB); + KokkosSparse:: + spmv ("N", scalar_t(-one), crsmatA, + localX, + one, tempR); + // > initial vector for the inner loop is zero + Kokkos::deep_copy (localZ, zero); + using Norm_Functor_t = TwostageGaussSeidel_functor; + using range_policy = Kokkos::RangePolicy ; + { + mag_t normR = zero; + Kokkos::parallel_reduce ("normR", range_policy (0, num_rows), + Norm_Functor_t (forward_sweep, num_rows, + rowmap_view, column_view, values_view, + localD, localZ, tempR), + normR); + std::cout << "> norm(JR)-" << 0 << " " << sqrt(normR) << std::endl; + } +#endif // compute starting vector: Z = D^{-1}*R (Z is correction, i.e., output of JR) - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) if (NumInnerSweeps == 0) { // this is Jacobi-Richardson X_{k+1} := X_{k} + D^{-1}(b-A*X_{k}) // copy to localZ (output of JR iteration) + + // row-scale: (D^{-1}*L)*Y = D^{-1}*B + // compute Z := D^{-1}*R KokkosBlas::mult (zero, localZ, one, localD, localR); + // apply inner damping factor, if not one + if (gamma != one) { + // Z = gamma * Z + KokkosBlas::scal (localZ, gamma, localZ); + } } else { // copy to localT (workspace used to save D^{-1}*R for JR iteration) KokkosBlas::mult (zero, localT, one, localD, localR); // initialize Jacobi-Richardson (using R as workspace for JR iteration) KokkosBlas::scal (localR, one, localT); + + // apply inner damping factor, if not one + if (gamma != one) { + // R = gamma * R + KokkosBlas::scal (localR, gamma, 
localR); + } + } +#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS + { + // compute residual norm of the starting vector (D^{-1}R) + mag_t normR = zero; + Kokkos::parallel_reduce ("normR", range_policy (0, num_rows), + Norm_Functor_t (forward_sweep, num_rows, + rowmap_view, column_view, values_view, + localD, localT, tempR), + normR); + std::cout << "> norm(JR)-" << 1 << " " << sqrt(normR) << std::endl; } - #else - KokkosBlas::mult (zero, localT, - one, localD, localR); - #endif +#endif // inner Jacobi-Richardson: for (int ii = 0; ii < NumInnerSweeps; ii++) { - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) // T = D^{-1}*R, and L = D^{-1}*L and U = D^{-1}*U // copy T into Z KokkosBlas::scal (localZ, one, localT); - #else - // Z = R - KokkosBlas::scal (localZ, one, localR); - #endif - if (direction == GS_FORWARD || - (direction == GS_SYMMETRIC && sweep%2 == 0)) { + if (forward_sweep) { // Z = Z - L*R KokkosSparse:: - spmv("N", scalar_t(-one), crsmatL, - localR, - one, localZ); + spmv("N", scalar_t(-omega), crsmatL, + localR, + one, localZ); } else { // Z = R - U*T KokkosSparse:: - spmv("N", scalar_t(-one), crsmatU, - localR, - one, localZ); + spmv("N", scalar_t(-omega), crsmatU, + localR, + one, localZ); + } + // apply inner damping factor, if not one + if (gamma != one) { + // Z = gamma * Z + KokkosBlas::scal (localZ, gamma, localZ); + // Z = Z + (one - one/gamma) * R + scalar_t gamma2 = one - gamma; + KokkosBlas::axpy (gamma2, localR, localZ); } - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) if (ii+1 < NumInnerSweeps) { // reinitialize (R to be Z) KokkosBlas::scal (localR, one, localZ); } - #else - // T = D^{-1}*Z - KokkosBlas::mult (zero, localT, - one, localD, localZ); - #endif +#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS + { + // compute residual norm(r - (L+D)*y) + mag_t normR = zero; + Kokkos::parallel_reduce ("normR", range_policy (0, num_rows), + Norm_Functor_t (forward_sweep, num_rows, + rowmap_view, column_view, values_view, + localD, localZ, 
tempR), + normR); + std::cout << "> norm(JR)-" << 2+ii << " " << sqrt(normR) << std::endl; + } +#endif } // end of inner Jacobi Richardson - } - // Y = X + T - auto localY = Kokkos::subview (localX, range_type(0, nrows), Kokkos::ALL ()); - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) - KokkosBlas::axpy (one, localZ, localY); - #else - KokkosBlas::axpy (one, localT, localY); - #endif + + // update solution + auto localY = Kokkos::subview (localX, range_type(0, num_rows), Kokkos::ALL ()); + if (compact_form) { + // Y := omega * z + KokkosBlas::scal (localY, omega, localZ); + } else { + // Y := X + omega * Z + KokkosBlas::axpy (omega, localZ, localY); + } + } // end of inner GS sweep } // end of outer GS sweep +#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS + { + // R = B - A*x + KokkosBlas::scal (localR, one, localB); + KokkosSparse:: + spmv ("N", scalar_t(-one), crsmatA, + localX, + one, localR); + auto localRj = Kokkos::subview (localR, Kokkos::ALL (), range_type (0, 1)); + single_vector_view_t Rj (localRj.data (), num_rows); + std::cout << "norm(GS)-" << NumSweeps << " " << KokkosBlas::nrm2 (Rj) << std::endl; + } +#endif } }; } From 0dbdf3ce52803f5bd449fda4eba56b000b66f08b Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 23 Mar 2021 17:53:44 -0600 Subject: [PATCH 111/126] Update testing scripts Load newer gcc with clang/9, clang/10 Update slurm for voltrino Update modules on blake Add openmptarget, sycl as device options --- cm_generate_makefile.bash | 8 ++++++ scripts/cm_test_all_sandia | 6 +++- scripts/update_lib.sh | 57 ++++++++++++++++++++++++++------------ 3 files changed, 53 insertions(+), 18 deletions(-) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index d8cfa6a18c..bb9913b05b 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -225,6 +225,8 @@ display_help_text() { echo "" echo "--with-cuda[=/Path/To/Cuda]: Enable Cuda and set path to Cuda Toolkit." 
echo "--with-hip[=/Path/To/Hip]: Enable Hip and set path to ROCM Toolkit." + echo "--with-openmptarget: Enable OpenMPTarget backend." + echo "--with-sycl: Enable Sycl backend." echo "--with-openmp: Enable OpenMP backend." echo "--with-pthread: Enable Pthreads backend." echo "--with-serial: Enable Serial backend." @@ -396,6 +398,12 @@ do --with-openmp) update_kokkos_devices OpenMP ;; + --with-openmptarget) + update_kokkos_devices OpenMPTarget + ;; + --with-sycl) + update_kokkos_devices Sycl + ;; --with-pthread) update_kokkos_devices Pthread ;; diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 58c265458e..9f46614770 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -609,7 +609,7 @@ elif [ "$MACHINE" = "voltrino" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 - BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/19.05.5a,/,gcc/9.3.0" + BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/20.11.4a,/,gcc/9.3.0" # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("intel/17.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" @@ -664,6 +664,8 @@ elif [ "$MACHINE" = "blake" ]; then BASE_MODULE_LIST="cmake/3.19.3,/" BASE_MODULE_LIST_INTEL="cmake/3.19.3,/compilers/" + BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,/oneAPI/base-toolkit/" + ONEAPI_WARNING_FLAGS="" GCC72_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.2.20/gcc/7.2.0" @@ -688,12 +690,14 @@ elif [ "$MACHINE" = "blake" ]; then "intel/19.1.144 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.3.199 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/2021.1.1 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" "gcc/5.5.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" 
"gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/8.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) fi diff --git a/scripts/update_lib.sh b/scripts/update_lib.sh index 822efa28b8..34ab5dd3c9 100755 --- a/scripts/update_lib.sh +++ b/scripts/update_lib.sh @@ -1,30 +1,53 @@ #!/bin/bash -if [ "$1" = blake ]; then - ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" - if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.0.128 ]]; then - module swap gcc/4.9.3 gcc/6.4.0 - module list - fi -fi -if [ "$1" = kokkos-dev ]; then +local machine_input="$1" +local compiler_input="$2" + +check_sems_intel() { ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" if [[ "${ICPCVER}" = 17.* ]]; then module swap sems-gcc/4.9.3 sems-gcc/6.4.0 module list fi -fi -if [ "$1" = kokkos-dev-2 ]; then - ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" - if [[ "${ICPCVER}" = 17.* ]]; then - module swap sems-gcc/4.9.3 sems-gcc/6.4.0 + if [[ "${ICPCVER}" = 19.* ]]; then + # Newer gcc needed for c++ standard beyond c++14 + module swap sems-gcc/6.1.0 sems-gcc/7.2.0 module list fi -fi -if [ "$1" = sems ]; then +} + +check_sems_clang() { + CLANGVER=$(clang --version | grep "clang version" | cut -d " " -f 3) + if [[ "${CLANGVER}" = 9.* ]] || [[ "${CLANGVER}" = 10.* ]]; then + # Newer gcc needed for c++ standard beyond c++14 + module swap sems-gcc/5.3.0 sems-gcc/6.4.0 + module list + fi +} + +check_compiler_modules() { + if [[ "$compiler_input" = clang/* ]]; then + echo " clang compiler - check supporting modules" + check_sems_clang + elif [[ "$compiler_input" = intel/* ]]; then + echo " intel compiler - check supporting modules" + check_sems_intel + fi +} + +if [ "$machine_input" = blake ]; then ICPCVER="$(icpc --version | grep icpc | cut -d 
' ' -f 3)" - if [[ "${ICPCVER}" = 17.* ]]; then - module swap sems-gcc/4.8.4 sems-gcc/6.4.0 + if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.0.128 ]]; then + module swap gcc/4.9.3 gcc/6.4.0 module list fi fi +if [ "$machine_input" = kokkos-dev ]; then + check_compiler_modules +fi +if [ "$machine_input" = kokkos-dev-2 ]; then + check_compiler_modules +fi +if [ "$machine_input" = sems ] || [ "$machine_input" = sogpu ]; then + check_compiler_modules +fi From 5c665027526b72fc8ecf98c3321eac5e967dca20 Mon Sep 17 00:00:00 2001 From: iyamaza Date: Mon, 29 Mar 2021 15:14:38 -0600 Subject: [PATCH 112/126] fixing typos and removing commented-out code --- src/sparse/KokkosSparse_gauss_seidel_handle.hpp | 2 +- .../impl/KokkosSparse_twostage_gauss_seidel_impl.hpp | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp index 9b03c9d63e..9176809115 100644 --- a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp @@ -702,7 +702,7 @@ namespace KokkosSparse{ crsmat_t crsmatL; crsmat_t crsmatU; // > complements for compact form of recurrence - // where La = A - U and Ua = A - U + // where La = A - U and Ua = A - L values_view_t Da; crsmat_t crsmatLa; crsmat_t crsmatUa; diff --git a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index 73fb486c4e..3e0022989f 100644 --- a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -981,13 +981,11 @@ namespace KokkosSparse{ #endif } } - //if (sweep == 0) { - // auto localY = Kokkos::subview (localX, range_type(num_rows, localX.extent(0)), Kokkos::ALL ()); - // Kokkos::deep_copy (localY, zero); - //} if (!two_stage) { // ===== sparse-triangular solve ===== - // TODO: omega is not supported here (because omega*L + D is extracted in 
initialize_numeric, but omega is passed into apply) + // TODO: omega is not supported here + // (L + D is extracted in initialize_numeric, + // but (omega*L + D)^{-1} needs to be applied with omega passed into apply) // hence, omega = one if (forward_sweep) { // Z = (omega * L + D)^{-1} * R From b23a8d9e4f66b2e5dc3cfa8b07b2936e4f6d56b0 Mon Sep 17 00:00:00 2001 From: iyamaza Date: Mon, 29 Mar 2021 15:21:10 -0600 Subject: [PATCH 113/126] use nnz_scalar_t instead of scalar_t_ --- src/common/KokkosKernels_Handle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp index 2be4d345ce..39ac62267c 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/common/KokkosKernels_Handle.hpp @@ -593,7 +593,7 @@ class KokkosKernelsHandle } // ---------------------------------------- // // Specify damping factor of inner sweeps for two-stage Gauss-Seidel - void set_gs_set_inner_damp_factor (scalar_t_ damp_factor) { + void set_gs_set_inner_damp_factor (nnz_scalar_t damp_factor) { auto gs2 = get_twostage_gs_handle(); gs2->setInnerDampFactor (damp_factor); } From d53452fad86c8361997a294000dfa83d7653c1e3 Mon Sep 17 00:00:00 2001 From: iyamaza Date: Mon, 29 Mar 2021 23:35:31 -0600 Subject: [PATCH 114/126] throw invalid-arg exception when omega != one using SpTRSV with two-stage GS --- src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index 3e0022989f..7b2db4ba9a 100644 --- a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -987,6 +987,9 @@ namespace KokkosSparse{ // (L + D is extracted in initialize_numeric, // but (omega*L + D)^{-1} needs to be applied with omega passed into apply) // hence, omega = one + if (omega != one) { + throw 
std::invalid_argument (" *** TwostageGaussSeidel::apply with omega != one is not supported with sptrsv ***\n"); + } if (forward_sweep) { // Z = (omega * L + D)^{-1} * R // NOTE: need to go over RHSs From 311a38e3fb6c60dd017d09f6f9c0da0ebbf7b406 Mon Sep 17 00:00:00 2001 From: iyamaza Date: Tue, 30 Mar 2021 01:19:17 -0600 Subject: [PATCH 115/126] set omega=one for unit-test with two-stage using sptrsv --- unit_test/sparse/Test_Sparse_gauss_seidel.hpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp index 9993d46e22..cbdb673bb1 100644 --- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp @@ -97,6 +97,8 @@ int run_gauss_seidel( KernelHandle; + scalar_t omega(0.9); + KernelHandle kh; kh.set_team_work_size(16); kh.set_dynamic_scheduling(true); @@ -106,6 +108,11 @@ int run_gauss_seidel( // test for two-stage/classical gs kh.create_gs_handle(gs_algorithm); kh.set_gs_twostage(!classic, input_mat.numRows()); + if (classic) { + // two-stage with SpTRSV supports only omega = one + const scalar_t one = Kokkos::Details::ArithTraits::one (); + omega = one; + } } else kh.create_gs_handle(GS_DEFAULT); @@ -120,8 +127,6 @@ int run_gauss_seidel( gauss_seidel_numeric (&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, is_symmetric_graph); - scalar_t omega(0.9); - switch (apply_type){ case 0: symmetric_gauss_seidel_apply From d38a65f04b30fdbc378e77be3f76c4b4b2ac7e12 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 1 Apr 2021 15:12:06 -0400 Subject: [PATCH 116/126] Workaround using new/delete in kernel code --- .../impl/KokkosGraph_Distance1Color_impl.hpp | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 
110756a364..eeba934446 100644 --- a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -2058,25 +2058,33 @@ class GraphColor_VBD:public GraphColor newFrontierSize_; size_type maxColors_; color_view_type colors_; - - functorDeterministicColoring(const_lno_row_view_t rowPtr, - const_lno_nnz_view_t colInd, - nnz_lno_persistent_work_view_t dependency, - nnz_lno_temp_work_view_t frontier, - Kokkos::View frontierSize, - nnz_lno_temp_work_view_t newFrontier, - Kokkos::View newFrontierSize, - size_type maxColors, - color_view_type colors) - : xadj_(rowPtr), adj_(colInd), dependency_(dependency), frontier_(frontier), - frontierSize_(frontierSize), newFrontier_(newFrontier), newFrontierSize_(newFrontierSize), - maxColors_(maxColors), colors_(colors) {} + Kokkos::View bannedColors_; + + functorDeterministicColoring( + const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, + nnz_lno_persistent_work_view_t dependency, + nnz_lno_temp_work_view_t frontier, + Kokkos::View frontierSize, + nnz_lno_temp_work_view_t newFrontier, + Kokkos::View newFrontierSize, + size_type maxColors, color_view_type colors) + : xadj_(rowPtr), + adj_(colInd), + dependency_(dependency), + frontier_(frontier), + frontierSize_(frontierSize), + newFrontier_(newFrontier), + newFrontierSize_(newFrontierSize), + maxColors_(maxColors), + colors_(colors), + bannedColors_("KokkosKernels::bannedColors", + maxColors_ * frontier.size()) {} KOKKOS_INLINE_FUNCTION void operator() (const size_type frontierIdx) const { typedef typename std::remove_reference< decltype( newFrontierSize_() ) >::type atomic_incr_type; size_type frontierNode = frontier_(frontierIdx); - int* bannedColors = new int[maxColors_]; + int *bannedColors = bannedColors_.data() + maxColors_ * frontierIdx; for(size_type colorIdx= 0; colorIdx < maxColors_; ++colorIdx) { bannedColors[colorIdx] = 0; } @@ -2105,7 +2113,6 @@ class GraphColor_VBD:public GraphColor Date: Thu, 1 Apr 2021 17:19:02 
-0400 Subject: [PATCH 117/126] Use 2D view instead --- src/graph/impl/KokkosGraph_Distance1Color_impl.hpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp index eeba934446..3adda031df 100644 --- a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -2058,7 +2058,7 @@ class GraphColor_VBD:public GraphColor newFrontierSize_; size_type maxColors_; color_view_type colors_; - Kokkos::View bannedColors_; + Kokkos::View bannedColors_; functorDeterministicColoring( const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, @@ -2077,21 +2077,20 @@ class GraphColor_VBD:public GraphColor ::type atomic_incr_type; size_type frontierNode = frontier_(frontierIdx); - int *bannedColors = bannedColors_.data() + maxColors_ * frontierIdx; for(size_type colorIdx= 0; colorIdx < maxColors_; ++colorIdx) { - bannedColors[colorIdx] = 0; + bannedColors_(frontierIdx, colorIdx) = 0; } // Loop over neighbors, find banned colors, decrement dependency and update newFrontier for(size_type neigh = xadj_(frontierNode); neigh < xadj_(frontierNode + 1); ++neigh) { - bannedColors[colors_(adj_(neigh))] = 1; + bannedColors_(frontierIdx, colors_(adj_(neigh))) = 1; // We want to avoid the cost of atomic operations when not needed // so let's check that the node is not already colored, i.e. 
@@ -2108,7 +2107,7 @@ class GraphColor_VBD:public GraphColor Date: Thu, 1 Apr 2021 13:52:03 -0400 Subject: [PATCH 118/126] SYCL: adding ETI and CMake logic for SYCL backend --- cmake/KokkosKernelsConfig.cmake.in | 1 + cmake/KokkosKernels_config.h.in | 4 ++ cmake/kokkos_backends.cmake | 1 + cmake/kokkoskernels_eti_devices.cmake | 44 ++++++++++++++++++--- src/Kokkos_ArithTraits.hpp | 44 ++++++++++++++++----- src/batched/KokkosBatched_Util.hpp | 20 ++++++++++ src/common/KokkosKernels_ExecSpaceUtils.hpp | 24 ++++++++++- test_common/Test_Common_ArithTraits.hpp | 3 +- 8 files changed, 124 insertions(+), 17 deletions(-) diff --git a/cmake/KokkosKernelsConfig.cmake.in b/cmake/KokkosKernelsConfig.cmake.in index 31d77bda94..6b95ff91ae 100644 --- a/cmake/KokkosKernelsConfig.cmake.in +++ b/cmake/KokkosKernelsConfig.cmake.in @@ -12,6 +12,7 @@ find_dependency(Kokkos HINTS @Kokkos_DIR@) SET(Kokkos_ENABLE_OPENMP @Kokkos_ENABLE_OPENMP@) SET(Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) SET(Kokkos_ENABLE_HIP @Kokkos_ENABLE_HIP@) +SET(Kokkos_ENABLE_SYCL @Kokkos_ENABLE_SYCL@) SET(Kokkos_ENABLE_PTHREAD @Kokkos_ENABLE_PTHREAD@) SET(Kokkos_ENABLE_SERIAL @Kokkos_ENABLE_SERIAL@) diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index 7cb277baed..9326edc47a 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -37,6 +37,10 @@ /* Whether to build kernels for execution space Kokkos::Experimental::HIP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_HIP #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE +/* Whether to build kernels for execution space Kokkos::Experimental::SYCL */ +#cmakedefine KOKKOSKERNELS_INST_EXECSPACE_SYCL +#cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSPACE +#cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE /* Whether to build kernels for execution space Kokkos::OpenMP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_OPENMP /* Whether to build kernels for execution space Kokkos::Threads */ diff --git 
a/cmake/kokkos_backends.cmake b/cmake/kokkos_backends.cmake index c2f46bb8e3..eb7d8602b7 100644 --- a/cmake/kokkos_backends.cmake +++ b/cmake/kokkos_backends.cmake @@ -11,6 +11,7 @@ ENDMACRO(CHECK_KOKKOS_BACKEND) CHECK_KOKKOS_BACKEND(CUDA) CHECK_KOKKOS_BACKEND(HIP) +CHECK_KOKKOS_BACKEND(SYCL) CHECK_KOKKOS_BACKEND(OPENMP) CHECK_KOKKOS_BACKEND(PTHREAD) CHECK_KOKKOS_BACKEND(SERIAL) diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index ede934023c..ad7ef15e55 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -5,12 +5,14 @@ SET(EXEC_SPACES EXECSPACE_CUDA EXECSPACE_HIP + EXECSPACE_SYCL EXECSPACE_OPENMP EXECSPACE_PTHREAD EXECSPACE_SERIAL ) SET(EXECSPACE_CUDA_CPP_TYPE Kokkos::Cuda) SET(EXECSPACE_HIP_CPP_TYPE Kokkos::Experimental::HIP) +SET(EXECSPACE_SYCL_CPP_TYPE Kokkos::Experimental::SYCL) SET(EXECSPACE_OPENMP_CPP_TYPE Kokkos::OpenMP) SET(EXECSPACE_PTHREAD_CPP_TYPE Kokkos::Threads) SET(EXECSPACE_SERIAL_CPP_TYPE Kokkos::Serial) @@ -19,14 +21,18 @@ SET(MEM_SPACES MEMSPACE_CUDASPACE MEMSPACE_CUDAUVMSPACE MEMSPACE_HIPSPACE + MEMSPACE_SYCLSPACE + MEMSPACE_SYCLSHAREDSPACE MEMSPACE_HOSTSPACE MEMSPACE_HBWSPACE ) -SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) -SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) -SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::Experimental::HIPSpace) -SET(MEMSPACE_HOSTSPACE_CPP_TYPE Kokkos::HostSpace) -SET(MEMSPACE_HBWSPACE_CPP_TYPE Kokkos::HBWSpace) +SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) +SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) +SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::Experimental::HIPSpace) +SET(MEMSPACE_SYCLSPACE_CPP_TYPE Kokkos::Experimental::SYCLDeviceUSMSpace) +SET(MEMSPACE_SYCLSHAREDSPACE_CPP_TYPE Kokkos::Experimental::SYCLSharedUSMSpace) +SET(MEMSPACE_HOSTSPACE_CPP_TYPE Kokkos::HostSpace) +SET(MEMSPACE_HBWSPACE_CPP_TYPE Kokkos::HBWSpace) IF(KOKKOS_ENABLE_CUDA) KOKKOSKERNELS_ADD_OPTION( @@ -85,6 +91,33 @@ 
IF(KOKKOS_ENABLE_HIP) ENDIF() +IF(KOKKOS_ENABLE_SYCL) + KOKKOSKERNELS_ADD_OPTION( + INST_EXECSPACE_SYCL + ${KOKKOSKERNELS_INST_EXECSPACE_SYCL_DEFAULT} + BOOL + "Whether to pre instantiate kernels for the execution space Kokkos::Experimental::SYCL. Disabling this when Kokkos_ENABLE_SYCL is enabled may increase build times. Default: ON if Kokkos is SYCL-enabled, OFF otherwise." + ) + KOKKOSKERNELS_ADD_OPTION( + INST_MEMSPACE_SYCLSPACE + ${KOKKOSKERNELS_INST_EXECSPACE_SYCL_DEFAULT} + BOOL + "Whether to pre instantiate kernels for the memory space Kokkos::Experimental::SYCLSpace. Disabling this when Kokkos_ENABLE_SYCL is enabled may increase build times. Default: ON if Kokkos is SYCL-enabled, OFF otherwise." + ) + + IF(KOKKOSKERNELS_INST_EXECSPACE_SYCL AND KOKKOSKERNELS_INST_MEMSPACE_SYCLSPACE) + LIST(APPEND DEVICE_LIST "") + ENDIF() + IF(KOKKOSKERNELS_INST_EXECSPACE_SYCL AND KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE) + LIST(APPEND DEVICE_LIST "") + ENDIF() + + IF( Trilinos_ENABLE_COMPLEX_DOUBLE AND ((NOT DEFINED CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS) OR (NOT CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS)) ) + MESSAGE( WARNING "The CMake option CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS is either undefined or OFF. 
Please set CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS:BOOL=ON when building with SYCL and complex double enabled.") + ENDIF() + +ENDIF() + KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_HOSTSPACE ${KOKKOSKERNELS_ADD_DEFAULT_ETI} @@ -138,6 +171,7 @@ KOKKOSKERNELS_ADD_OPTION( SET(EXECSPACE_CUDA_VALID_MEM_SPACES CUDASPACE CUDAUVMSPACE) SET(EXECSPACE_HIP_VALID_MEM_SPACES HIPSPACE) +SET(EXECSPACE_SYCL_VALID_MEM_SPACES SYCLSPACE SYCLSHAREDSPACE) SET(EXECSPACE_SERIAL_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_OPENMP_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_PTHREAD_VALID_MEM_SPACES HBWSPACE HOSTSPACE) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 83c483a3d6..48a6a47ea4 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -858,13 +858,17 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const float x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; - #endif +#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isinf +#endif return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const float x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; - #endif +#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isnan +#endif return isnan (x); } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs (const float x) { @@ -1030,18 +1034,38 @@ class ArithTraits > { return std::complex (ArithTraits::infinity (), ArithTraits::infinity ()); } - static bool isInf (const std::complex& x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + template + static bool isInf(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; - #endif +#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isinf +#endif return isinf (real (x)) || isinf (imag (x)); } - static bool isNan (const std::complex& x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +#ifdef KOKKOS_ENABLE_SYCL + template <> + static bool 
isInf(const std::complex& x) { + Kokkos::abort("isInf not available for std::complex!\n"); + return true; + } +#endif + template + static bool isNan(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; - #endif +#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isnan +#endif return isnan (real (x)) || isnan (imag (x)); } +#ifdef KOKKOS_ENABLE_SYCL + template <> + static bool isNan(const std::complex& x) { + Kokkos::abort("isNan not available for std::complex!\n"); + return true; + } +#endif static mag_type abs (const std::complex& x) { return std::abs (x); } @@ -1214,13 +1238,13 @@ class ArithTraits { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; #endif - return isinf (x); + return sycl::isinf(x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; #endif - return isnan (x); + return sycl::isnan(x); } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs (const val_type x) { return ::fabs (x); diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index eb9883c425..89dd200150 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -282,6 +282,16 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> ::type mb() { return 2; } +#endif +#if defined(KOKKOS_ENABLE_SYCL) + template + KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if< + std::is_same::value, + int>::type + mb() { + return 2; + } #endif template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> @@ -331,6 +341,16 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> ::type mb() { return 1; } +#endif +#if defined(KOKKOS_ENABLE_SYCL) + template + KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if< + std::is_same::value, + 
int>::type + mb() { + return 1; + } #endif template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp index 59bcf487fb..1106dc2ddc 100644 --- a/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -53,7 +53,15 @@ namespace KokkosKernels{ namespace Impl{ -enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA, Exec_HIP}; +enum ExecSpaceType { + Exec_SERIAL, + Exec_OMP, + Exec_PTHREADS, + Exec_QTHREADS, + Exec_CUDA, + Exec_HIP, + Exec_SYCL +}; template KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ ExecSpaceType exec_space = Exec_SERIAL; @@ -87,6 +95,12 @@ KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ } #endif +#if defined(KOKKOS_ENABLE_SYCL) + if (std::is_same::value) { + exec_space = Exec_SYCL; + } +#endif + #if defined( KOKKOS_ENABLE_QTHREAD) if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ exec_space = Exec_QTHREADS; @@ -115,6 +129,14 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space +constexpr KOKKOS_INLINE_FUNCTION bool +kk_is_gpu_exec_space() { + return true; +} +#endif + //Host function to determine free and total device memory. //Will throw if execution space doesn't support this. 
template diff --git a/test_common/Test_Common_ArithTraits.hpp b/test_common/Test_Common_ArithTraits.hpp index bba54ff6f0..4fab021e66 100644 --- a/test_common/Test_Common_ArithTraits.hpp +++ b/test_common/Test_Common_ArithTraits.hpp @@ -1634,7 +1634,8 @@ int runAllArithTraitsHostTests (std::ostream& out, const int verbose) success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); -#if !defined( KOKKOS_ENABLE_CUDA ) && !defined( KOKKOS_ENABLE_HIP ) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ + !defined(KOKKOS_ENABLE_SYCL) // This would spill tons of warnings about host device stuff otherwise success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); From cbd1a71ef3e08faaedd106f4c657a2cf58819f49 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 1 Apr 2021 14:39:13 -0400 Subject: [PATCH 119/126] Fix preprocessor guard --- src/common/KokkosKernels_ExecSpaceUtils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp index 1106dc2ddc..9e06bc45f2 100644 --- a/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -129,7 +129,7 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space() { From 1ad9c3ec5507a4e7dc4cd816171f772a59cbbe56 Mon Sep 17 00:00:00 2001 From: Daniel Arndt Date: Thu, 1 Apr 2021 15:07:04 -0400 Subject: [PATCH 120/126] Remove wrong sycl::prefix --- src/Kokkos_ArithTraits.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 48a6a47ea4..c8a20f4e96 100644 --- a/src/Kokkos_ArithTraits.hpp +++ 
b/src/Kokkos_ArithTraits.hpp @@ -1237,14 +1237,18 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; + #elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isinf; #endif - return sycl::isinf(x); + return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; + #elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isnan; #endif - return sycl::isnan(x); + return isnan (x); } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs (const val_type x) { return ::fabs (x); From 0c04e887bbc49b6fbb6c0e7d77fc4929787df56e Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 5 Apr 2021 15:34:27 -0600 Subject: [PATCH 121/126] cm_test_all_sandia: pass compiler argument to update_libs update mayer's armclang module update cmake on white, weaver --- scripts/cm_test_all_sandia | 41 +++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index 9f46614770..f41cd818bf 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -502,16 +502,16 @@ elif [ "$MACHINE" = "white" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 - BASE_MODULE_LIST="cmake/3.18.0,/" - IBM_MODULE_LIST="cmake/3.18.0,/xl/,gcc/7.2.0" - CUDA_MODULE_LIST="cmake/3.18.0,/,gcc/7.2.0,ibm/xl/16.1.1" - CUDA10_MODULE_LIST="cmake/3.18.0,/,gcc/7.4.0,ibm/xl/16.1.1" + BASE_MODULE_LIST="cmake/3.19.3,/" + IBM_MODULE_LIST="cmake/3.19.3,/xl/,gcc/7.2.0" + CUDA_MODULE_LIST="cmake/3.19.3,/,gcc/7.2.0,ibm/xl/16.1.1" + CUDA10_MODULE_LIST="cmake/3.19.3,/,gcc/7.4.0,ibm/xl/16.1.1" - GCC72_MODULE_TPL_LIST="cmake/3.18.0,/,netlib/3.8.0/gcc/7.2.0" - GCC74_MODULE_TPL_LIST="cmake/3.18.0,/,openblas/0.3.4/gcc/7.4.0" - CUDA_MODULE_TPL_LIST="cmake/3.18.0,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" - 
CUDA10_MODULE_TPL_LIST="cmake/3.18.0,/,gcc/7.4.0,openblas/0.3.4/gcc/7.4.0" - IBM_MODULE_TPL_LIST="cmake/3.18.0,/xl/,gcc/7.2.0,netlib/3.8.0/ibm/xl/16.1.1" + GCC72_MODULE_TPL_LIST="cmake/3.19.3,/,netlib/3.8.0/gcc/7.2.0" + GCC74_MODULE_TPL_LIST="cmake/3.19.3,/,openblas/0.3.4/gcc/7.4.0" + CUDA_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" + CUDA10_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.4.0,openblas/0.3.4/gcc/7.4.0" + IBM_MODULE_TPL_LIST="cmake/3.19.3,/xl/,gcc/7.2.0,netlib/3.8.0/ibm/xl/16.1.1" # Don't do pthread on white. GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" @@ -534,7 +534,8 @@ elif [ "$MACHINE" = "white" ]; then ) else # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "ibm/16.1.1 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" @@ -555,14 +556,14 @@ elif [ "$MACHINE" = "weaver" ]; then eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True - BASE_MODULE_LIST="cmake/3.18.0,/" - IBM_MODULE_LIST="cmake/3.18.0,/xl/,gcc/7.2.0" - CUDA_MODULE_LIST="cmake/3.18.0,/,gcc/7.2.0,ibm/xl/16.1.1" - CUDA10_MODULE_LIST="cmake/3.18.0,/,gcc/7.4.0,ibm/xl/16.1.1" + BASE_MODULE_LIST="cmake/3.19.3,/" + IBM_MODULE_LIST="cmake/3.19.3,/xl/,gcc/7.2.0" + CUDA_MODULE_LIST="cmake/3.19.3,/,gcc/7.2.0,ibm/xl/16.1.1" + CUDA10_MODULE_LIST="cmake/3.19.3,/,gcc/7.4.0,ibm/xl/16.1.1" - GCC72_MODULE_TPL_LIST="cmake/3.18.0,/,openblas/0.2.20/gcc/7.2.0" - CUDA_MODULE_TPL_LIST="cmake/3.18.0,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" - CUDA10_MODULE_TPL_LIST="cmake/3.18.0,/,gcc/7.2.0,openblas/0.2.20/gcc/7.2.0" + GCC72_MODULE_TPL_LIST="cmake/3.19.3,/,openblas/0.2.20/gcc/7.2.0" + 
CUDA_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" + CUDA10_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.2.0,openblas/0.2.20/gcc/7.2.0" # Issues finding CUBLAS with cuda/10.1.243 module at configure # "Could NOT find TPLCUBLAS (missing: CUDA_CUBLAS_LIBRARIES)" # Once resolved add the compiler + modules below to the SPOT_CHECK_TPLS @@ -630,7 +631,7 @@ elif [ "$MACHINE" = "mayer" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gnu7/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "arm/20.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $ARMCLANG_WARNING_FLAGS") + "arm/20.1 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $ARMCLANG_WARNING_FLAGS") if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=ARMV8_THUNDERX2" @@ -1109,8 +1110,8 @@ setup_env() { done if [ -e ${CM_ALL_SCRIPT_PATH}/update_lib.sh ]; then - echo "calling ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE" - source ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE + echo "calling ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE $compiler" + source ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE $compiler fi return 0 From 43ffb9c09504660eafdead145a5206081852562a Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Wed, 7 Apr 2021 16:43:38 -0600 Subject: [PATCH 122/126] Add static_assert/throw in batched eigendecomp so users know it's not supported yet. Disable the unit tests for it. Serial eigendecomposition on host with LAPACK still works. 
--- .../KokkosBatched_Eigendecomposition_Serial_Internal.hpp | 9 ++++++++- ...kosBatched_Eigendecomposition_TeamVector_Internal.hpp | 4 ++++ .../batched/Test_Batched_SerialEigendecomposition.hpp | 2 ++ .../Test_Batched_SerialEigendecomposition_Real.hpp | 3 ++- .../Test_Batched_TeamVectorEigendecomposition.hpp | 2 ++ .../Test_Batched_TeamVectorEigendecomposition_Real.hpp | 4 ++-- 6 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp b/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp index 07ca8933cf..f46a278e8b 100644 --- a/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp +++ b/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp @@ -57,6 +57,7 @@ namespace KokkosBatched { RealType * w, const int wlen) { /// until debugging is done, comment out the code /// testing happens only for TPLs on host. + static_assert(false, "Serial eigendecomposition on device and/or without LAPACK is not implemented yet"); // typedef RealType real_type; // typedef Kokkos::Details::ArithTraits ats; @@ -356,9 +357,12 @@ namespace KokkosBatched { RealType * UR, const int urs0, const int urs1, RealType * w, const int wlen) { #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) - if (as0 == 1 || as1 == 1) { + //if (as0 == 1 || as1 == 1) { /// column major or row major and it runs on host /// potentially it can run tpls internally + // NOTE BMK: If LAPACK not enabled, this will static_assert. + // If neither stride is unit, will runtime assert. + // Otherwise will succeed using LAPACK. 
host_invoke(m, A, as0, as1, er, ers, @@ -366,6 +370,7 @@ namespace KokkosBatched { UL, uls0, uls1, UR, urs0, urs1, w, wlen); + /* } else { /// arbitrary strides should be handled by native implementation device_invoke(m, @@ -375,7 +380,9 @@ namespace KokkosBatched { UL, uls0, uls1, UR, urs0, urs1, w, wlen); + throw std::runtime_error("Serial eigendecomposition without unit stride implemented yet."); } + */ #else /// device code runs device_invoke(m, diff --git a/src/batched/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp b/src/batched/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp index b63d847786..7c4026d1e9 100644 --- a/src/batched/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp @@ -76,6 +76,8 @@ namespace KokkosBatched { RealType * UL, const int uls0, const int uls1, RealType * UR, const int urs0, const int urs1, RealType * w, const int wlen) { + static_assert(false, "TeamVector eigendecomposition is not implemented yet."); + /* #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) if (as0 == 1 || as1 == 1) { /// column major or row major and it runs on host @@ -100,6 +102,7 @@ namespace KokkosBatched { UL, uls0, uls1, UR, urs0, urs1, w, wlen); + throw std::runtime_error("TeamVector eigendecomposition is not implemented yet."); } #else /// device code runs @@ -111,6 +114,7 @@ namespace KokkosBatched { UR, urs0, urs1, w, wlen); #endif +*/ return 0; } }; diff --git a/unit_test/batched/Test_Batched_SerialEigendecomposition.hpp b/unit_test/batched/Test_Batched_SerialEigendecomposition.hpp index 6ee4ebef18..b63998b75f 100644 --- a/unit_test/batched/Test_Batched_SerialEigendecomposition.hpp +++ b/unit_test/batched/Test_Batched_SerialEigendecomposition.hpp @@ -1,5 +1,6 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) +/* #include "gtest/gtest.h" #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" @@ -111,3 +112,4 @@ int test_batched_serial_eigendecomposition() { 
return 0; } +*/ diff --git a/unit_test/batched/Test_Batched_SerialEigendecomposition_Real.hpp b/unit_test/batched/Test_Batched_SerialEigendecomposition_Real.hpp index 7108f56bbb..344438e719 100644 --- a/unit_test/batched/Test_Batched_SerialEigendecomposition_Real.hpp +++ b/unit_test/batched/Test_Batched_SerialEigendecomposition_Real.hpp @@ -1,3 +1,4 @@ +/* #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_serial_eigendecomposition_float ) { test_batched_serial_eigendecomposition(); @@ -9,5 +10,5 @@ TEST_F( TestCategory, batched_scalar_serial_eigendecomposition_double ) { test_batched_serial_eigendecomposition(); } #endif - +*/ diff --git a/unit_test/batched/Test_Batched_TeamVectorEigendecomposition.hpp b/unit_test/batched/Test_Batched_TeamVectorEigendecomposition.hpp index 9dd2a6b048..a02c701acd 100644 --- a/unit_test/batched/Test_Batched_TeamVectorEigendecomposition.hpp +++ b/unit_test/batched/Test_Batched_TeamVectorEigendecomposition.hpp @@ -1,5 +1,6 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) +/* #include "gtest/gtest.h" #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" @@ -114,3 +115,4 @@ int test_batched_teamvector_eigendecomposition() { return 0; } +*/ diff --git a/unit_test/batched/Test_Batched_TeamVectorEigendecomposition_Real.hpp b/unit_test/batched/Test_Batched_TeamVectorEigendecomposition_Real.hpp index 14b3c61f4d..b4646c3027 100644 --- a/unit_test/batched/Test_Batched_TeamVectorEigendecomposition_Real.hpp +++ b/unit_test/batched/Test_Batched_TeamVectorEigendecomposition_Real.hpp @@ -1,3 +1,4 @@ +/* #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_teamvector_eigendecomposition_float ) { test_batched_teamvector_eigendecomposition(); @@ -9,5 +10,4 @@ TEST_F( TestCategory, batched_scalar_teamvector_eigendecomposition_double ) { test_batched_teamvector_eigendecomposition(); } #endif - - +*/ From 3341296071c94d3b101e275b5c2a172ce2b9ac5d Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Tue, 13 
Apr 2021 17:46:19 -0600 Subject: [PATCH 123/126] Kokkos_ArithTraits.hpp: Fix isInf and isNan with complex types Detected during release 3.4.0 testing with Trilinos --- src/Kokkos_ArithTraits.hpp | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index c8a20f4e96..f96ffc49c3 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -856,7 +856,7 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION float infinity() { return HUGE_VALF; } static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const float x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; #elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL using sycl::isinf @@ -864,7 +864,7 @@ class ArithTraits { return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const float x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; #elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL using sycl::isnan @@ -1034,6 +1034,7 @@ class ArithTraits > { return std::complex (ArithTraits::infinity (), ArithTraits::infinity ()); } +#ifdef KOKKOS_ENABLE_SYCL template static bool isInf(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST @@ -1043,13 +1044,20 @@ class ArithTraits > { #endif return isinf (real (x)) || isinf (imag (x)); } -#ifdef KOKKOS_ENABLE_SYCL template <> static bool isInf(const std::complex& x) { Kokkos::abort("isInf not available for std::complex!\n"); return true; } +#else + static bool isInf(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isinf; +#endif + return isinf (real (x)) || isinf (imag (x)); + } #endif +#ifdef KOKKOS_ENABLE_SYCL template static bool isNan(const std::complex& x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST @@ -1059,12 +1067,18 @@ class ArithTraits > { #endif return isnan 
(real (x)) || isnan (imag (x)); } -#ifdef KOKKOS_ENABLE_SYCL template <> static bool isNan(const std::complex& x) { Kokkos::abort("isNan not available for std::complex!\n"); return true; } +#else + static bool isNan(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isnan; +#endif + return isnan (real (x)) || isnan (imag (x)); + } #endif static mag_type abs (const std::complex& x) { return std::abs (x); From dd0d4ef84ce662ea313ff3de22fe0c0e12077a17 Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Sun, 25 Apr 2021 23:42:46 -0600 Subject: [PATCH 124/126] Adding Changelog for Release 3.4.00 Part of Kokkos C++ Performance Portability Programming EcoSystem 3.4 --- CHANGELOG.md | 35 +++++++++++++++++++++++++++++++++++ CMakeLists.txt | 2 +- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51e31ef007..27b2fccc41 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,40 @@ # Change Log +## [3.4.00](https://github.com/kokkos/kokkos-kernels/tree/3.4.00) (2021-04-25) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.3.01...3.4.00) + +**Features:** +- SYCL: adding ETI and CMake logic for SYCL backend [\#924](https://github.com/kokkos/kokkos/pull/924) + +**Implemented enhancements Algorithms and Archs:** +- Two-stage GS: add damping factors [\#921](https://github.com/kokkos/kokkos/pull/921) +- Supernodal SpTRSV, improve symbolic performance [\#899](https://github.com/kokkos/kokkos/pull/899) +- Add MKL SpMV wrapper [\#895](https://github.com/kokkos/kokkos/pull/895) +- Serial code path for spmv [\#893](https://github.com/kokkos/kokkos/pull/893) + +**Implemented enhancements BuildSystem:** +- Cmake: Update ArmPL support [\#901](https://github.com/kokkos/kokkos/pull/901) +- Cmake: Add ARMPL TPL support [\#880](https://github.com/kokkos/kokkos/pull/880) +- IntelClang guarding __assume_aligned with !defined(__clang__) [\#878](https://github.com/kokkos/kokkos/pull/878) + 
+**Implemented enhancements Other:** +- Add static_assert/throw in batched eigendecomp [\#931](https://github.com/kokkos/kokkos/pull/931) +- Workaround using new/delete in kernel code [\#925](https://github.com/kokkos/kokkos/pull/925) +- Blas perf_test updates [\#892](https://github.com/kokkos/kokkos/pull/892) + +**Fixed bugs:** +- Fix ctor CrsMat mirror with CrsGraph mirror [\#918](https://github.com/kokkos/kokkos/pull/918) +- Fix nrm1, removed cublas nrminf, improved blas tests [\#915](https://github.com/kokkos/kokkos/pull/915) +- Fix and testing coverage mainly in graph coarsening [\#910](https://github.com/kokkos/kokkos/pull/910) +- Fix KokkosSparse for nightly test failure [\#898](https://github.com/kokkos/kokkos/pull/898) +- Fix view types across ternary operator [\#894](https://github.com/kokkos/kokkos/pull/894) +- Make work_view_t typedef consistent [\#885](https://github.com/kokkos/kokkos/pull/885) +- Fix supernodal SpTRSV build with serial+openmp+cuda [\#884](https://github.com/kokkos/kokkos/pull/884) +- Construct SpGEMM C with correct ncols [\#883](https://github.com/kokkos/kokkos/pull/883) +- Matrix Converter: fixing issue with deallocation after Kokkos::fininalize [\#882](https://github.com/kokkos/kokkos/pull/882) +- Fix >1024 team size error in sort_crs_* [\#872](https://github.com/kokkos/kokkos/pull/872) +- Fixing seg fault with empty matrix in kspiluk [\#871](https://github.com/kokkos/kokkos/pull/871) + ## [3.3.00](https://github.com/kokkos/kokkos-kernels/tree/3.3.00) (2020-12-16) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.2.01...3.3.00) diff --git a/CMakeLists.txt b/CMakeLists.txt index eb9211d354..1f698db668 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) PROJECT(KokkosKernels CXX) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) - SET(KokkosKernels_VERSION_MINOR 3) + SET(KokkosKernels_VERSION_MINOR 4) SET(KokkosKernels_VERSION_PATCH 0) ENDIF() From 
d3c33910bb546f4c16e5166215cb887e155e6bb7 Mon Sep 17 00:00:00 2001 From: Luc Berger Date: Mon, 26 Apr 2021 10:47:39 -0600 Subject: [PATCH 125/126] Update CHANGELOG.md --- CHANGELOG.md | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 911bb32197..187d99d376 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,36 +4,36 @@ [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.3.01...3.4.00) **Features:** -- SYCL: adding ETI and CMake logic for SYCL backend [\#924](https://github.com/kokkos/kokkos/pull/924) +- SYCL: adding ETI and CMake logic for SYCL backend [\#924](https://github.com/kokkos/kokkos-kernels/pull/924) **Implemented enhancements Algorithms and Archs:** -- Two-stage GS: add damping factors [\#921](https://github.com/kokkos/kokkos/pull/921) -- Supernodal SpTRSV, improve symbolic performance [\#899](https://github.com/kokkos/kokkos/pull/899) -- Add MKL SpMV wrapper [\#895](https://github.com/kokkos/kokkos/pull/895) -- Serial code path for spmv [\#893](https://github.com/kokkos/kokkos/pull/893) +- Two-stage GS: add damping factors [\#921](https://github.com/kokkos/kokkos-kernels/pull/921) +- Supernodal SpTRSV, improve symbolic performance [\#899](https://github.com/kokkos/kokkos-kernels/pull/899) +- Add MKL SpMV wrapper [\#895](https://github.com/kokkos/kokkos-kernels/pull/895) +- Serial code path for spmv [\#893](https://github.com/kokkos/kokkos-kernels/pull/893) **Implemented enhancements BuildSystem:** -- Cmake: Update ArmPL support [\#901](https://github.com/kokkos/kokkos/pull/901) -- Cmake: Add ARMPL TPL support [\#880](https://github.com/kokkos/kokkos/pull/880) -- IntelClang guarding __assume_aligned with !defined(__clang__) [\#878](https://github.com/kokkos/kokkos/pull/878) +- Cmake: Update ArmPL support [\#901](https://github.com/kokkos/kokkos-kernels/pull/901) +- Cmake: Add ARMPL TPL support 
[\#880](https://github.com/kokkos/kokkos-kernels/pull/880) +- IntelClang guarding __assume_aligned with !defined(__clang__) [\#878](https://github.com/kokkos/kokkos-kernels/pull/878) **Implemented enhancements Other:** -- Add static_assert/throw in batched eigendecomp [\#931](https://github.com/kokkos/kokkos/pull/931) -- Workaround using new/delete in kernel code [\#925](https://github.com/kokkos/kokkos/pull/925) -- Blas perf_test updates [\#892](https://github.com/kokkos/kokkos/pull/892) +- Add static_assert/throw in batched eigendecomp [\#931](https://github.com/kokkos/kokkos-kernels/pull/931) +- Workaround using new/delete in kernel code [\#925](https://github.com/kokkos/kokkos-kernels/pull/925) +- Blas perf_test updates [\#892](https://github.com/kokkos/kokkos-kernels/pull/892) **Fixed bugs:** -- Fix ctor CrsMat mirror with CrsGraph mirror [\#918](https://github.com/kokkos/kokkos/pull/918) -- Fix nrm1, removed cublas nrminf, improved blas tests [\#915](https://github.com/kokkos/kokkos/pull/915) -- Fix and testing coverage mainly in graph coarsening [\#910](https://github.com/kokkos/kokkos/pull/910) -- Fix KokkosSparse for nightly test failure [\#898](https://github.com/kokkos/kokkos/pull/898) -- Fix view types across ternary operator [\#894](https://github.com/kokkos/kokkos/pull/894) -- Make work_view_t typedef consistent [\#885](https://github.com/kokkos/kokkos/pull/885) -- Fix supernodal SpTRSV build with serial+openmp+cuda [\#884](https://github.com/kokkos/kokkos/pull/884) -- Construct SpGEMM C with correct ncols [\#883](https://github.com/kokkos/kokkos/pull/883) -- Matrix Converter: fixing issue with deallocation after Kokkos::fininalize [\#882](https://github.com/kokkos/kokkos/pull/882) -- Fix >1024 team size error in sort_crs_* [\#872](https://github.com/kokkos/kokkos/pull/872) -- Fixing seg fault with empty matrix in kspiluk [\#871](https://github.com/kokkos/kokkos/pull/871) +- Fix ctor CrsMat mirror with CrsGraph mirror 
[\#918](https://github.com/kokkos/kokkos-kernels/pull/918) +- Fix nrm1, removed cublas nrminf, improved blas tests [\#915](https://github.com/kokkos/kokkos-kernels/pull/915) +- Fix and testing coverage mainly in graph coarsening [\#910](https://github.com/kokkos/kokkos-kernels/pull/910) +- Fix KokkosSparse for nightly test failure [\#898](https://github.com/kokkos/kokkos-kernels/pull/898) +- Fix view types across ternary operator [\#894](https://github.com/kokkos/kokkos-kernels/pull/894) +- Make work_view_t typedef consistent [\#885](https://github.com/kokkos/kokkos-kernels/pull/885) +- Fix supernodal SpTRSV build with serial+openmp+cuda [\#884](https://github.com/kokkos/kokkos-kernels/pull/884) +- Construct SpGEMM C with correct ncols [\#883](https://github.com/kokkos/kokkos-kernels/pull/883) +- Matrix Converter: fixing issue with deallocation after Kokkos::finalize [\#882](https://github.com/kokkos/kokkos-kernels/pull/882) +- Fix >1024 team size error in sort_crs_* [\#872](https://github.com/kokkos/kokkos-kernels/pull/872) +- Fixing seg fault with empty matrix in kspiluk [\#871](https://github.com/kokkos/kokkos-kernels/pull/871) ## [3.3.01](https://github.com/kokkos/kokkos-kernels/tree/3.3.01) (2021-01-18) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.3.00...3.3.01) From dc871171b862ab740119281e96a62d90d9bff9ba Mon Sep 17 00:00:00 2001 From: Nathan Ellingwood Date: Mon, 26 Apr 2021 12:08:24 -0600 Subject: [PATCH 126/126] Update master_history for Kokkos 3.4.0 --- master_history.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/master_history.txt b/master_history.txt index a113e3619f..022e459733 100644 --- a/master_history.txt +++ b/master_history.txt @@ -12,3 +12,4 @@ tag: 3.1.01 date: 05/04/2020 master: 43773523 release: 6fce7502 tag: 3.2.00 date: 08/19/2020 master: 07a60bcc release: ea3f2b77 tag: 3.3.00 date: 12/16/2020 master: 42defc56 release: e5279e55 tag: 3.3.01 date: 01/18/2021 master: f64b1c57 release: 4e1cc00b +tag: 3.4.00
date: 04/26/2021 master: fe439b21 release: d3c33910