diff --git a/.jenkins/nightly.groovy b/.jenkins/nightly.groovy new file mode 100644 index 0000000000..41e4daf71e --- /dev/null +++ b/.jenkins/nightly.groovy @@ -0,0 +1,41 @@ +pipeline { + agent none + + stages { + stage('HIP-ROCm-3.10-C++14') { + agent { + dockerfile { + filename 'Dockerfile.hip' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:3.10' + label 'rocm-docker && vega' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + steps { + sh '''rm -rf kokkos && + git clone -b develop https://github.com/kokkos/kokkos.git && cd kokkos && \ + mkdir build && cd build && \ + cmake \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DKokkos_ENABLE_HIP=ON \ + -DKokkos_ARCH_VEGA906=ON \ + .. && \ + make -j8 && make install && \ + cd ../.. && rm -rf kokkos''' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES=ON \ + -DKokkos_ENABLE_HIP=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + .. && \ + make -j8 && ctest --verbose''' + } + } + } +} diff --git a/BUILD.md b/BUILD.md index 19ea0fd573..023cf96f4e 100644 --- a/BUILD.md +++ b/BUILD.md @@ -125,6 +125,12 @@ endif() * CUSPARSE_LIBRARY_DIRS: STRING * Optional override for the library directories that comprise TPL CUSPARSE. * Default: None. Default common library locations will be searched +* ARMPL_LIBRARIES: STRING + * Optional override for the libraries that comprise TPL ARMPL. + * Default: None. Default common library names will be searched +* ARMPL_LIBRARY_DIRS: STRING + * Optional override for the library directories that comprise TPL ARMPL. + * Default: None. 
Default common library locations will be searched * KokkosKernels_BLAS_ROOT: PATH * Location of BLAS install root. * Default: None or the value of the environment variable BLAS_ROOT if set @@ -161,6 +167,9 @@ endif() * KokkosKernels_ENABLE_TPL_MKL: BOOL * Whether to enable MKL * Default: OFF +* KokkosKernels_ENABLE_TPL_ARMPL: BOOL + * Whether to enable ARMPL + * Default: OFF * KokkosKernels_ETI_ONLY: BOOL * Whether to restrict availability of kernels to ETI types only. Turning this on guarantees that kernels are never built inside of object files which simply call KokkosKernels functions. * Default: OFF diff --git a/CHANGELOG.md b/CHANGELOG.md index 05b18d1c86..187d99d376 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,40 @@ # Change Log +## [3.4.00](https://github.com/kokkos/kokkos-kernels/tree/3.4.00) (2021-04-25) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.3.01...3.4.00) + +**Features:** +- SYCL: adding ETI and CMake logic for SYCL backend [\#924](https://github.com/kokkos/kokkos-kernels/pull/924) + +**Implemented enhancements Algorithms and Archs:** +- Two-stage GS: add damping factors [\#921](https://github.com/kokkos/kokkos-kernels/pull/921) +- Supernodal SpTRSV, improve symbolic performance [\#899](https://github.com/kokkos/kokkos-kernels/pull/899) +- Add MKL SpMV wrapper [\#895](https://github.com/kokkos/kokkos-kernels/pull/895) +- Serial code path for spmv [\#893](https://github.com/kokkos/kokkos-kernels/pull/893) + +**Implemented enhancements BuildSystem:** +- Cmake: Update ArmPL support [\#901](https://github.com/kokkos/kokkos-kernels/pull/901) +- Cmake: Add ARMPL TPL support [\#880](https://github.com/kokkos/kokkos-kernels/pull/880) +- IntelClang guarding __assume_aligned with !defined(__clang__) [\#878](https://github.com/kokkos/kokkos-kernels/pull/878) + +**Implemented enhancements Other:** +- Add static_assert/throw in batched eigendecomp [\#931](https://github.com/kokkos/kokkos-kernels/pull/931) +- Workaround using 
new/delete in kernel code [\#925](https://github.com/kokkos/kokkos-kernels/pull/925) +- Blas perf_test updates [\#892](https://github.com/kokkos/kokkos-kernels/pull/892) + +**Fixed bugs:** +- Fix ctor CrsMat mirror with CrsGraph mirror [\#918](https://github.com/kokkos/kokkos-kernels/pull/918) +- Fix nrm1, removed cublas nrminf, improved blas tests [\#915](https://github.com/kokkos/kokkos-kernels/pull/915) +- Fix and testing coverage mainly in graph coarsening [\#910](https://github.com/kokkos/kokkos-kernels/pull/910) +- Fix KokkosSparse for nightly test failure [\#898](https://github.com/kokkos/kokkos-kernels/pull/898) +- Fix view types across ternary operator [\#894](https://github.com/kokkos/kokkos-kernels/pull/894) +- Make work_view_t typedef consistent [\#885](https://github.com/kokkos/kokkos-kernels/pull/885) +- Fix supernodal SpTRSV build with serial+openmp+cuda [\#884](https://github.com/kokkos/kokkos-kernels/pull/884) +- Construct SpGEMM C with correct ncols [\#883](https://github.com/kokkos/kokkos-kernels/pull/883) +- Matrix Converter: fixing issue with deallocation after Kokkos::finalize [\#882](https://github.com/kokkos/kokkos-kernels/pull/882) +- Fix >1024 team size error in sort_crs_* [\#872](https://github.com/kokkos/kokkos-kernels/pull/872) +- Fixing seg fault with empty matrix in kspiluk [\#871](https://github.com/kokkos/kokkos-kernels/pull/871) + ## [3.3.01](https://github.com/kokkos/kokkos-kernels/tree/3.3.01) (2021-01-18) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.3.00...3.3.01) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b9000cddb..1f698db668 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,8 +24,8 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) PROJECT(KokkosKernels CXX) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) - SET(KokkosKernels_VERSION_MINOR 3) - SET(KokkosKernels_VERSION_PATCH 1) + SET(KokkosKernels_VERSION_MINOR 4) + SET(KokkosKernels_VERSION_PATCH 0) ENDIF() IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL
"3.12.0") @@ -196,6 +196,10 @@ ELSE() MESSAGE("") # Skip building Kokkos Kernels if we are doing an installation test ADD_SUBDIRECTORY(src) + IF(KokkosKernels_ENABLE_INSTALL_TEST) + ADD_SUBDIRECTORY(install_test) + MESSAGE("The install test has been enabled, you will need to peform: make install before running the tests otherwise install_test will fail") + ENDIF() KOKKOSKERNELS_ADD_TEST_DIRECTORIES(test_common) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(perf_test) KOKKOSKERNELS_ADD_TEST_DIRECTORIES(unit_test) diff --git a/CheckHostBlasReturnComplex.cmake b/CheckHostBlasReturnComplex.cmake index 78ae33515b..30063b1cc3 100644 --- a/CheckHostBlasReturnComplex.cmake +++ b/CheckHostBlasReturnComplex.cmake @@ -5,7 +5,12 @@ FUNCTION(CHECK_HOST_BLAS_RETURN_COMPLEX VARNAME) IF (KOKKOSKERNELS_HAS_TRILINOS) SET(CMAKE_REQUIRED_LIBRARIES ${TPL_BLAS_LIBRARIES}) ELSE() - SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) + # For TPLs, just pull out the required libraries from the target properties. + IF (KOKKOSKERNELS_ENABLE_TPL_ARMPL) + GET_TARGET_PROPERTY(CMAKE_REQUIRED_LIBRARIES KokkosKernels::ARMPL INTERFACE_LINK_LIBRARIES) + ELSE() + SET(CMAKE_REQUIRED_LIBRARIES ${BLAS_LIBRARIES}) + ENDIF() ENDIF() SET(SOURCE diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index bb246df3c6..bb9913b05b 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -225,6 +225,8 @@ display_help_text() { echo "" echo "--with-cuda[=/Path/To/Cuda]: Enable Cuda and set path to Cuda Toolkit." echo "--with-hip[=/Path/To/Hip]: Enable Hip and set path to ROCM Toolkit." + echo "--with-openmptarget: Enable OpenMPTarget backend." + echo "--with-sycl: Enable Sycl backend." echo "--with-openmp: Enable OpenMP backend." echo "--with-pthread: Enable Pthreads backend." echo "--with-serial: Enable Serial backend." @@ -313,7 +315,7 @@ display_help_text() { echo "--with-tpls=[TPLS]: Set tpls to be instantiated (Proper support requies that appropriate compiler and device must be enabled)."
echo " This may require providing paths and the library name if using custom installs not on a default path" echo " that CMake searches" - echo " Options: blas, mkl, cublas, cusparse, magma" + echo " Options: blas, mkl, cublas, cusparse, magma, armpl" echo "--user-blas-path=[PATH]: Set path to location of user-specified BLAS library." echo "--user-blas-lib=[LIB]: Library name of desired BLAS install." echo " Example: For the typical \"libblas.a\" provide \"blas\"" @@ -396,6 +398,12 @@ do --with-openmp) update_kokkos_devices OpenMP ;; + --with-openmptarget) + update_kokkos_devices OpenMPTarget + ;; + --with-sycl) + update_kokkos_devices Sycl + ;; --with-pthread) update_kokkos_devices Pthread ;; @@ -569,7 +577,7 @@ done if [ "$KOKKOS_CXX_STANDARD" == "" ]; then STANDARD_CMD= else - STANDARD_CMD=-DKokkos_CXX_STANDARD=${KOKKOS_CXX_STANDARD} + STANDARD_CMD=-DCMAKE_CXX_STANDARD=${KOKKOS_CXX_STANDARD} fi if [ "$COMPILER" == "" ]; then diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 66990dd126..0aa97b1d6c 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,5 +1,5 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA SUPERLU CHOLMOD LAPACKE CBLAS + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA SUPERLU CHOLMOD LAPACKE CBLAS ARMPL TEST_OPTIONAL_TPLS yaml-cpp ) diff --git a/cmake/KokkosKernelsConfig.cmake.in b/cmake/KokkosKernelsConfig.cmake.in index 31d77bda94..6b95ff91ae 100644 --- a/cmake/KokkosKernelsConfig.cmake.in +++ b/cmake/KokkosKernelsConfig.cmake.in @@ -12,6 +12,7 @@ find_dependency(Kokkos HINTS @Kokkos_DIR@) SET(Kokkos_ENABLE_OPENMP @Kokkos_ENABLE_OPENMP@) SET(Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) SET(Kokkos_ENABLE_HIP @Kokkos_ENABLE_HIP@) +SET(Kokkos_ENABLE_SYCL @Kokkos_ENABLE_SYCL@) SET(Kokkos_ENABLE_PTHREAD @Kokkos_ENABLE_PTHREAD@) SET(Kokkos_ENABLE_SERIAL @Kokkos_ENABLE_SERIAL@) diff --git 
a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index c0a1e98ec6..9326edc47a 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -37,6 +37,10 @@ /* Whether to build kernels for execution space Kokkos::Experimental::HIP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_HIP #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HIPSPACE +/* Whether to build kernels for execution space Kokkos::Experimental::SYCL */ +#cmakedefine KOKKOSKERNELS_INST_EXECSPACE_SYCL +#cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSPACE +#cmakedefine KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE /* Whether to build kernels for execution space Kokkos::OpenMP */ #cmakedefine KOKKOSKERNELS_INST_EXECSPACE_OPENMP /* Whether to build kernels for execution space Kokkos::Threads */ @@ -102,11 +106,14 @@ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_LAPACKE /* METIS */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_METIS +/* ARMPL */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_ARMPL #cmakedefine KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV -/* if MKL, BLAS is also defined */ -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) +/* if MKL or ARMPL, BLAS is also defined */ +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) ||\ + defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) #if !defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) #define KOKKOSKERNELS_ENABLE_TPL_BLAS #endif diff --git a/cmake/Modules/FindTPLARMPL.cmake b/cmake/Modules/FindTPLARMPL.cmake new file mode 100644 index 0000000000..62e1e33ea3 --- /dev/null +++ b/cmake/Modules/FindTPLARMPL.cmake @@ -0,0 +1,47 @@ +# Both the armpl_mp and armpl libraries define the same public symbol names. +# In order to link against the openmp armpl symbols, instruct cmake to link against armpl_mp. +# In order to link against the default armpl symbols, instruct cmake to link against armpl. 
+IF(KOKKOSKERNELS_INST_EXECSPACE_OPENMP) + SET(ARMPL_LIB armpl_mp) +ELSE() + SET(ARMPL_LIB armpl) +ENDIF() + +IF (ARMPL_LIBRARY_DIRS AND ARMPL_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES ${ARMPL_LIBRARIES} LIBRARY_PATHS ${ARMPL_LIBRARY_DIRS}) +ELSEIF (ARMPL_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES ${ARMPL_LIBRARIES}) +ELSEIF (ARMPL_LIBRARY_DIRS) + KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES amath ${ARMPL_LIB} LIBRARY_PATHS ${ARMPL_LIBRARY_DIRS}) +ELSEIF (DEFINED ENV{ARMPL_DIR}) + SET(ARMPL_ROOT $ENV{ARMPL_DIR}) + KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE + LIBRARIES + amath + ${ARMPL_LIB} + LIBRARY_PATHS + ${ARMPL_ROOT}/lib + HEADERS + armpl.h + HEADER_PATHS + ${ARMPL_ROOT}/include + ) +ELSE() + FIND_PACKAGE(ARMPL REQUIRED) + KOKKOSKERNELS_CREATE_IMPORTED_TPL(ARMPL INTERFACE LINK_LIBRARIES ${ARMPL_LIBRARIES}) +ENDIF() + +TRY_COMPILE(KOKKOSKERNELS_TRY_COMPILE_ARMPL + ${KOKKOSKERNELS_TOP_BUILD_DIR}/tpl_tests + ${KOKKOSKERNELS_TOP_SOURCE_DIR}/cmake/compile_tests/armpl.cpp + LINK_LIBRARIES -l${ARMPL_LIB} -lgfortran -lamath -lm + OUTPUT_VARIABLE KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT) +IF(NOT KOKKOSKERNELS_TRY_COMPILE_ARMPL) + MESSAGE(FATAL_ERROR "KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT=${KOKKOSKERNELS_TRY_COMPILE_ARMPL_OUT}") +ELSE() + # KokkosKernels::ARMPL is an alias to the ARMPL target. + # Let's add in the libgfortran and libm dependencies for users here. 
+ GET_TARGET_PROPERTY(ARMPL_INTERFACE_LINK_LIBRARIES KokkosKernels::ARMPL INTERFACE_LINK_LIBRARIES) + SET(ARMPL_INTERFACE_LINK_LIBRARIES "${ARMPL_INTERFACE_LINK_LIBRARIES};-lgfortran;-lm") + SET_TARGET_PROPERTIES(ARMPL PROPERTIES INTERFACE_LINK_LIBRARIES "${ARMPL_INTERFACE_LINK_LIBRARIES}") +ENDIF() diff --git a/cmake/compile_tests/armpl.cpp b/cmake/compile_tests/armpl.cpp new file mode 100644 index 0000000000..9bb1c48392 --- /dev/null +++ b/cmake/compile_tests/armpl.cpp @@ -0,0 +1,5 @@ +#include + +int main(void) { + return 0; +} diff --git a/cmake/kokkos_backends.cmake b/cmake/kokkos_backends.cmake index c2f46bb8e3..eb7d8602b7 100644 --- a/cmake/kokkos_backends.cmake +++ b/cmake/kokkos_backends.cmake @@ -11,6 +11,7 @@ ENDMACRO(CHECK_KOKKOS_BACKEND) CHECK_KOKKOS_BACKEND(CUDA) CHECK_KOKKOS_BACKEND(HIP) +CHECK_KOKKOS_BACKEND(SYCL) CHECK_KOKKOS_BACKEND(OPENMP) CHECK_KOKKOS_BACKEND(PTHREAD) CHECK_KOKKOS_BACKEND(SERIAL) diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index ede934023c..ad7ef15e55 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -5,12 +5,14 @@ SET(EXEC_SPACES EXECSPACE_CUDA EXECSPACE_HIP + EXECSPACE_SYCL EXECSPACE_OPENMP EXECSPACE_PTHREAD EXECSPACE_SERIAL ) SET(EXECSPACE_CUDA_CPP_TYPE Kokkos::Cuda) SET(EXECSPACE_HIP_CPP_TYPE Kokkos::Experimental::HIP) +SET(EXECSPACE_SYCL_CPP_TYPE Kokkos::Experimental::SYCL) SET(EXECSPACE_OPENMP_CPP_TYPE Kokkos::OpenMP) SET(EXECSPACE_PTHREAD_CPP_TYPE Kokkos::Threads) SET(EXECSPACE_SERIAL_CPP_TYPE Kokkos::Serial) @@ -19,14 +21,18 @@ SET(MEM_SPACES MEMSPACE_CUDASPACE MEMSPACE_CUDAUVMSPACE MEMSPACE_HIPSPACE + MEMSPACE_SYCLSPACE + MEMSPACE_SYCLSHAREDSPACE MEMSPACE_HOSTSPACE MEMSPACE_HBWSPACE ) -SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) -SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) -SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::Experimental::HIPSpace) -SET(MEMSPACE_HOSTSPACE_CPP_TYPE Kokkos::HostSpace) 
-SET(MEMSPACE_HBWSPACE_CPP_TYPE Kokkos::HBWSpace) +SET(MEMSPACE_CUDASPACE_CPP_TYPE Kokkos::CudaSpace) +SET(MEMSPACE_CUDAUVMSPACE_CPP_TYPE Kokkos::CudaUVMSpace) +SET(MEMSPACE_HIPSPACE_CPP_TYPE Kokkos::Experimental::HIPSpace) +SET(MEMSPACE_SYCLSPACE_CPP_TYPE Kokkos::Experimental::SYCLDeviceUSMSpace) +SET(MEMSPACE_SYCLSHAREDSPACE_CPP_TYPE Kokkos::Experimental::SYCLSharedUSMSpace) +SET(MEMSPACE_HOSTSPACE_CPP_TYPE Kokkos::HostSpace) +SET(MEMSPACE_HBWSPACE_CPP_TYPE Kokkos::HBWSpace) IF(KOKKOS_ENABLE_CUDA) KOKKOSKERNELS_ADD_OPTION( @@ -85,6 +91,33 @@ IF(KOKKOS_ENABLE_HIP) ENDIF() +IF(KOKKOS_ENABLE_SYCL) + KOKKOSKERNELS_ADD_OPTION( + INST_EXECSPACE_SYCL + ${KOKKOSKERNELS_INST_EXECSPACE_SYCL_DEFAULT} + BOOL + "Whether to pre instantiate kernels for the execution space Kokkos::Experimental::SYCL. Disabling this when Kokkos_ENABLE_SYCL is enabled may increase build times. Default: ON if Kokkos is SYCL-enabled, OFF otherwise." + ) + KOKKOSKERNELS_ADD_OPTION( + INST_MEMSPACE_SYCLSPACE + ${KOKKOSKERNELS_INST_EXECSPACE_SYCL_DEFAULT} + BOOL + "Whether to pre instantiate kernels for the memory space Kokkos::Experimental::SYCLSpace. Disabling this when Kokkos_ENABLE_SYCL is enabled may increase build times. Default: ON if Kokkos is SYCL-enabled, OFF otherwise." + ) + + IF(KOKKOSKERNELS_INST_EXECSPACE_SYCL AND KOKKOSKERNELS_INST_MEMSPACE_SYCLSPACE) + LIST(APPEND DEVICE_LIST "") + ENDIF() + IF(KOKKOSKERNELS_INST_EXECSPACE_SYCL AND KOKKOSKERNELS_INST_MEMSPACE_SYCLSHAREDSPACE) + LIST(APPEND DEVICE_LIST "") + ENDIF() + + IF( Trilinos_ENABLE_COMPLEX_DOUBLE AND ((NOT DEFINED CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS) OR (NOT CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS)) ) + MESSAGE( WARNING "The CMake option CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS is either undefined or OFF. 
Please set CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS:BOOL=ON when building with SYCL and complex double enabled.") + ENDIF() + +ENDIF() + KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_HOSTSPACE ${KOKKOSKERNELS_ADD_DEFAULT_ETI} @@ -138,6 +171,7 @@ KOKKOSKERNELS_ADD_OPTION( SET(EXECSPACE_CUDA_VALID_MEM_SPACES CUDASPACE CUDAUVMSPACE) SET(EXECSPACE_HIP_VALID_MEM_SPACES HIPSPACE) +SET(EXECSPACE_SYCL_VALID_MEM_SPACES SYCLSPACE SYCLSHAREDSPACE) SET(EXECSPACE_SERIAL_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_OPENMP_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_PTHREAD_VALID_MEM_SPACES HBWSPACE HOSTSPACE) diff --git a/cmake/kokkoskernels_features.cmake b/cmake/kokkoskernels_features.cmake index 2212332b7d..6f4561f664 100644 --- a/cmake/kokkoskernels_features.cmake +++ b/cmake/kokkoskernels_features.cmake @@ -24,7 +24,7 @@ KOKKOSKERNELS_FEATURE_DEPENDS_ON_TPLS( # Fortran Complex BLAS # ================================================================== -IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL) +IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) INCLUDE(CheckHostBlasReturnComplex.cmake) CHECK_HOST_BLAS_RETURN_COMPLEX(KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX) ENDIF() diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index 08230dd987..2bdcda1e81 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -420,11 +420,12 @@ KOKKOSKERNELS_ADD_TPL_OPTION(MKL OFF "Whether to enable MKL") KOKKOSKERNELS_ADD_TPL_OPTION(MAGMA OFF "Whether to enable MAGMA") KOKKOSKERNELS_ADD_TPL_OPTION(CBLAS OFF "Whether to enable CBLAS") KOKKOSKERNELS_ADD_TPL_OPTION(LAPACKE OFF "Whether to enable LAPACKE") +KOKKOSKERNELS_ADD_TPL_OPTION(ARMPL OFF "Whether to enable ARMPL") # Set F77_BLAS_MANGLE macro based on Fortran-C interface (unless already set # by Trilinos or user) IF ("${F77_BLAS_MANGLE}" STREQUAL "") - IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL OR 
KOKKOSKERNELS_ENABLE_TPL_MAGMA) + IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL OR KOKKOSKERNELS_ENABLE_TPL_MAGMA OR KOKKOSKERNELS_ENABLE_TPL_ARMPL) ENABLE_LANGUAGE(C) ENABLE_LANGUAGE(Fortran) INCLUDE(FortranCInterface) @@ -481,6 +482,7 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) KOKKOSKERNELS_IMPORT_TPL(CHOLMOD) KOKKOSKERNELS_IMPORT_TPL(SUPERLU) KOKKOSKERNELS_IMPORT_TPL(METIS) + KOKKOSKERNELS_IMPORT_TPL(ARMPL) ENDIF() #Convert list to newlines (which CMake doesn't always like in cache variables) diff --git a/cmake/kokkoskernels_tribits.cmake b/cmake/kokkoskernels_tribits.cmake index 4eebb97c7b..b023d7c4d2 100644 --- a/cmake/kokkoskernels_tribits.cmake +++ b/cmake/kokkoskernels_tribits.cmake @@ -149,12 +149,13 @@ IF (IS_ENABLED) IF (KOKKOSKERNELS_HAS_TRILINOS) TRIBITS_ADD_EXECUTABLE(${EXE_NAME} SOURCES ${PARSE_SOURCES} - TESTONLYLIBS ${TESTONLYLIBS}) + TESTONLYLIBS ${PARSE_TESTONLYLIBS}) ELSE() ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkoskernels) IF (PARSE_TESTONLYLIBS) - TARGET_LINK_LIBRARIES(${EXE_NAME} ${PARSE_TESTONLYLIBS}) + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkoskernels ${PARSE_TESTONLYLIBS}) + ELSE () + TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkoskernels) ENDIF() ENDIF() ELSE() diff --git a/example/buildlib/compileKokkosKernels.sh b/example/buildlib/compileKokkosKernels.sh index 9f5978bb58..4bf57ce316 100755 --- a/example/buildlib/compileKokkosKernels.sh +++ b/example/buildlib/compileKokkosKernels.sh @@ -1,13 +1,22 @@ -KOKKOS_PATH=${HOME}/work/kokkos #path to kokkos source -KOKKOSKERNELS_SCALARS='double,"complex"' #the scalar types to instantiate =double,float... -KOKKOSKERNELS_LAYOUTS=LayoutLeft #the layout types to instantiate. -KOKKOSKERNELS_ORDINALS=int,long #ordinal types to instantiate -KOKKOSKERNELS_OFFSETS=int,size_t #offset types to instantiate -KOKKOSKERNELS_PATH=../.. #path to kokkos-kernels top directory. 
-KOKKOSKERNELS_OPTIONS=eti-only #options for kokkoskernels +#!/bin/bash +# Requires cmake version > 3.12 +# Paths to source +KOKKOS_PATH="${HOME}/Kokkos/kokkos" #path to kokkos source +KOKKOSKERNELS_PATH="../.." #path to kokkos-kernels top directory + +# Compiler - must be passed to kokkos and kokkos-kernels configurations +CXX=${KOKKOS_PATH}/bin/nvcc_wrapper #Options: icpc #g++ #clang++ CXXFLAGS="-Wall -pedantic -Werror -O3 -g -Wshadow -Wsign-compare -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized" -CXX=${KOKKOS_PATH}/bin/nvcc_wrapper #icpc # -KOKKOS_DEVICES=Serial,Cuda,OpenMP #devices Cuda... -KOKKOS_ARCHS=Pascal60,Power8 -../../scripts/generate_makefile.bash --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --kokkos-path=${KOKKOS_PATH} --with-devices=${KOKKOS_DEVICES} --arch=${KOKKOS_ARCHS} --compiler=${CXX} --with-options=${KOKKOSKERNELS_OPTIONS} --cxxflags="${CXXFLAGS}" +# Configure Kokkos (Unit Tests OFF) - Makefile located in kokkos-build +cmake -Bkokkos-build -DCMAKE_CXX_COMPILER=${CXX} -DKokkos_ARCH_PASCAL60=ON -DKokkos_ARCH_POWER8=ON -DKokkos_ENABLE_CUDA=ON -DKokkos_ENABLE_SERIAL=ON -DKokkos_ENABLE_OPENMP=ON -DKokkos_ENABLE_CUDA_LAMBDA=ON -DCMAKE_CXX_FLAGS="${CXXFLAGS}" -DCMAKE_INSTALL_PREFIX="${PWD}/kokkos-install" -DKokkos_ENABLE_TESTS=OFF ${KOKKOS_PATH} + +# Build and Install Kokkos - install lib at ${PWD}/kokkos-install +cmake --build kokkos-build -j 8 --target install + + +# Configure KokkosKernels (Unit Tests OFF) - Makefile located in kokkoskernels-build +cmake -Bkokkoskernels-build -DCMAKE_CXX_COMPILER=${CXX} -DKokkos_ROOT="${PWD}/kokkos-install" -DKokkosKernels_INST_DOUBLE=ON -DKokkosKernels_INST_COMPLEX_DOUBLE=ON -DKokkosKernels_INST_ORDINAL_INT=ON -DKokkosKernels_INST_ORDINAL_INT64_T=ON -DKokkosKernels_INST_OFFSET_INT=ON -DKokkosKernels_INST_OFFSET_SIZE_T=ON -DKokkosKernels_INST_LAYOUTLEFT=ON 
-DKokkosKernels_ADD_DEFAULT_ETI=ON -DCMAKE_INSTALL_PREFIX="${PWD}/kokkoskernels-install" -DKokkosKernels_ENABLE_TESTS=OFF -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF ${KOKKOSKERNELS_PATH} + +# Build and Install KokkosKernels - install lib at ${PWD}/kokkoskernels-install +cmake --build kokkoskernels-build -j 8 --target install diff --git a/example/buildlib/compileKokkosKernelsSimple.sh b/example/buildlib/compileKokkosKernelsSimple.sh index 20d0a7aef4..9502235aba 100755 --- a/example/buildlib/compileKokkosKernelsSimple.sh +++ b/example/buildlib/compileKokkosKernelsSimple.sh @@ -1,13 +1,17 @@ -KOKKOS_PATH=${HOME}/proj/kokkos #path to kokkos source +KOKKOS_PATH="${HOME}/Kokkos/kokkos" #path to kokkos source +KOKKOSKERNELS_PATH="../.." #path to kokkos-kernels top directory. + KOKKOSKERNELS_SCALARS=double #the scalar types to instantiate =double,float... KOKKOSKERNELS_LAYOUTS=LayoutLeft #the layout types to instantiate. KOKKOSKERNELS_ORDINALS=int #ordinal types to instantiate KOKKOSKERNELS_OFFSETS=int #offset types to instantiate -KOKKOSKERNELS_PATH=../.. #path to kokkos-kernels top directory. -CXX=${KOKKOS_PATH}/bin/nvcc_wrapper #icpc # +CXX=${KOKKOS_PATH}/bin/nvcc_wrapper KOKKOSKERNELS_OPTIONS=eti-only #options for kokkoskernels -KOKKOS_DEVICES=Cuda # other devices Cuda,Serial .. 
+KOKKOS_DEVICES=Cuda KOKKOS_ARCHS=SKX,Volta70 +KOKKOS_CUDA_OPTIONS=enable_lambda CXXFLAGS="-Wall -pedantic -Werror -O3 -g -Wshadow -Wsign-compare -Wtype-limits -Wuninitialized" -../../scripts/generate_makefile.bash --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --kokkos-path=${KOKKOS_PATH} --with-devices=${KOKKOS_DEVICES} --arch=${KOKKOS_ARCHS} --compiler=${CXX} --with-options=${KOKKOSKERNELS_OPTIONS} --cxxflags="${CXXFLAGS}" +../../cm_generate_makefile.bash --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --kokkos-path=${KOKKOS_PATH} --with-devices=${KOKKOS_DEVICES} --arch=${KOKKOS_ARCHS} --compiler=${CXX} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} --with-options=${KOKKOSKERNELS_OPTIONS} --cxxflags="${CXXFLAGS}" + +# Call "../../cm_generate_makefile.bash --help" for options diff --git a/install_test/CMakeLists.txt b/install_test/CMakeLists.txt new file mode 100644 index 0000000000..4be641e87a --- /dev/null +++ b/install_test/CMakeLists.txt @@ -0,0 +1,15 @@ +# First copy the CMakeLists.txt so we can build the test +configure_file(${PACKAGE_SOURCE_DIR}/install_test/CMakeLists.txt.in ${CMAKE_CURRENT_BINARY_DIR}/source/CMakeLists.txt) + +# Second copy the source files needed to the build area +file(COPY ${PACKAGE_SOURCE_DIR}/perf_test/sparse/KokkosSparse_pcg.hpp DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/source) +file(COPY ${PACKAGE_SOURCE_DIR}/perf_test/sparse/KokkosSparse_block_pcg.cpp DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/source) + +# Third write a configure file that can be invoked to test the library installation +configure_file(${PACKAGE_SOURCE_DIR}/install_test/run_install_test.sh.in ${CMAKE_CURRENT_BINARY_DIR}/run_install_test.sh @ONLY) + +# Fourth create the build directory where the installation of the cg example will take
place +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/build) + +add_test(NAME install_test COMMAND /bin/bash ${CMAKE_CURRENT_BINARY_DIR}/run_install_test.sh) +# KOKKOSKERNELS_ADD_TEST(NAME "install_test" COMMAND "${CMAKE_CURRENT_BINARY_DIR}/run_install_test.sh" COMPONENTS Sparse) diff --git a/install_test/CMakeLists.txt.in b/install_test/CMakeLists.txt.in new file mode 100644 index 0000000000..74605ac73f --- /dev/null +++ b/install_test/CMakeLists.txt.in @@ -0,0 +1,31 @@ +cmake_minimum_required(VERSION 3.13) +project(kokkoskernels_install_test CXX) + +include(CTest) + +find_package(KokkosKernels REQUIRED) + +add_executable(kokkoskernels_install_test KokkosSparse_block_pcg.cpp) +target_link_libraries(kokkoskernels_install_test PRIVATE Kokkos::kokkoskernels) + +if(BUILD_TESTING) + + add_test(NAME cg_test_serial COMMAND kokkoskernels_install_test --mtx auto --serial) + + if(KOKKOS_ENABLE_THREADS) + add_test(NAME cg_test_threads COMMAND kokkoskernels_install_test --mtx auto --threads 2) + endif() + + if(KOKKOS_ENABLE_OPENMP) + add_test(NAME cg_test_openmp COMMAND kokkoskernels_install_test --mtx auto --openmp 2) + endif() + + if(KOKKOS_ENABLE_CUDA) + add_test(NAME cg_test_cuda COMMAND kokkoskernels_install_test --mtx auto --cuda) + endif() + + if(KOKKOS_ENABLE_HIP) + add_test(NAME cg_test_hip COMMAND kokkoskernels_install_test --mtx auto --hip) + endif() + +endif() diff --git a/install_test/run_install_test.sh.in b/install_test/run_install_test.sh.in new file mode 100755 index 0000000000..a3b0fd6a59 --- /dev/null +++ b/install_test/run_install_test.sh.in @@ -0,0 +1,37 @@ +#!/bin/bash + +KOKKOSKERNELS_INTALL="@CMAKE_BINARY_DIR@" +INSTALL_TEST_SOURCE="@CMAKE_CURRENT_BINARY_DIR@/source" +INSTALL_TEST_BUILD="@CMAKE_CURRENT_BINARY_DIR@/build" + +cd "${INSTALL_TEST_BUILD}" +rm -rf CMake* + +cmake "${INSTALL_TEST_SOURCE}" \ + -D CMAKE_CXX_COMPILER="@CMAKE_CXX_COMPILER@" \ + -D KokkosKernels_ROOT:PATH="@CMAKE_BINARY_DIR@/@CMAKE_INSTALL_LIBDIR@/cmake/KokkosKernels" + +if [ $? 
-eq 0 ]; then + echo "*** install test: cmake configure SUCCESSFUL ***" +else + echo "*** install test: cmake configure FAILED ***" + exit 1; +fi + +make -j 4 + +if [ $? -eq 0 ]; then + echo "*** install test: build SUCCESSFUL ***" +else + echo "*** install test: build FAILED ***" + exit 1; +fi + +ctest -V -R + +if [ $? -eq 0 ]; then + echo "*** install test: run SUCCESSFUL ***" +else + echo "*** install test: run FAILED ***" + exit 1; +fi diff --git a/master_history.txt b/master_history.txt index a113e3619f..022e459733 100644 --- a/master_history.txt +++ b/master_history.txt @@ -12,3 +12,4 @@ tag: 3.1.01 date: 05/04/2020 master: 43773523 release: 6fce7502 tag: 3.2.00 date: 08/19/2020 master: 07a60bcc release: ea3f2b77 tag: 3.3.00 date: 12/16/2020 master: 42defc56 release: e5279e55 tag: 3.3.01 date: 01/18/2021 master: f64b1c57 release: 4e1cc00b +tag: 3.4.00 date: 04/26/2021 master: fe439b21 release: d3c33910 diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index fe3b3c51ba..08788d648d 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -10,6 +10,22 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../test_common) # build correctly with or without MPI, but only run them with a single # MPI process. +SET(GTEST_SOURCE_DIR ${PACKAGE_SOURCE_DIR}/tpls/gtest) + +KOKKOSKERNELS_ADD_TEST_LIBRARY( + kokkoskernelsperf_gtest + HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h + SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc + ) +# Disables pthreads, this is a problem for serial builds in Trilinos & Sierra if it's enabled. 
+TARGET_COMPILE_DEFINITIONS(kokkoskernelsperf_gtest PUBLIC "-DGTEST_HAS_PTHREAD=0") +TARGET_INCLUDE_DIRECTORIES(kokkoskernelsperf_gtest PUBLIC $) + +#Gtest minimally requires C++11 +TARGET_COMPILE_FEATURES(kokkoskernelsperf_gtest PUBLIC cxx_std_11) + + + ADD_COMPONENT_SUBDIRECTORY(batched) ADD_COMPONENT_SUBDIRECTORY(graph) ADD_COMPONENT_SUBDIRECTORY(sparse) diff --git a/perf_test/batched/README.md b/perf_test/batched/README.md new file mode 100644 index 0000000000..ca5920ae39 --- /dev/null +++ b/perf_test/batched/README.md @@ -0,0 +1 @@ +Batched BLAS performance tests reside in `perf_test/blas/{blas,blas3}`. diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp index 0c0dbe4eac..fc7e727123 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp @@ -608,20 +608,12 @@ namespace KokkosBatched { Kokkos::deep_copy(a, amat); Kokkos::deep_copy(b, bmat); -<<<<<<< HEAD:perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp - Kokkos::fence(); -======= DeviceSpaceType().fence(); ->>>>>>> develop:perf_test/batched/KokkosBatched_Test_Trsm_Cuda.cpp timer.reset(); Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV2", policy, functor_type(a, b)); -<<<<<<< HEAD:perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp - Kokkos::fence(); -======= DeviceSpaceType().fence(); ->>>>>>> develop:perf_test/batched/KokkosBatched_Test_Trsm_Cuda.cpp const double t = timer.seconds(); tmin = std::min(tmin, t); tavg += (iter >= 0)*t; @@ -693,20 +685,12 @@ namespace KokkosBatched { Kokkos::deep_copy(a, amat); Kokkos::deep_copy(b, bmat); -<<<<<<< HEAD:perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp - Kokkos::fence(); -======= DeviceSpaceType().fence(); ->>>>>>> develop:perf_test/batched/KokkosBatched_Test_Trsm_Cuda.cpp timer.reset(); Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV3", policy, 
functor_type(a, b)); -<<<<<<< HEAD:perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp - Kokkos::fence(); -======= DeviceSpaceType().fence(); ->>>>>>> develop:perf_test/batched/KokkosBatched_Test_Trsm_Cuda.cpp const double t = timer.seconds(); tmin = std::min(tmin, t); tavg += (iter >= 0)*t; diff --git a/perf_test/blas/blas/KokkosBlas_common.hpp b/perf_test/blas/blas/KokkosBlas_common.hpp index a6f9c65d8b..54e79647bf 100644 --- a/perf_test/blas/blas/KokkosBlas_common.hpp +++ b/perf_test/blas/blas/KokkosBlas_common.hpp @@ -56,6 +56,7 @@ #define DEFAULT_STEP 3 #define DEFAULT_WARM_UP_N 100 #define DEFAULT_N 100 +#define DEFAULT_K 10 #define DEFAULT_OUT &std::cout #define DEFAULT_BLAS_ROUTINES "trtri," @@ -117,7 +118,7 @@ static std::string test_e_str[TEST_N]{"BLAS", "BATCHED"}; * @var n: Number of columns. */ struct matrix_dim { - int m, n; + int k, m, n; }; typedef struct matrix_dim matrix_dim_t; diff --git a/perf_test/blas/blas/KokkosBlas_perf_test.cpp b/perf_test/blas/blas/KokkosBlas_perf_test.cpp index 46e89d5abb..803286f266 100644 --- a/perf_test/blas/blas/KokkosBlas_perf_test.cpp +++ b/perf_test/blas/blas/KokkosBlas_perf_test.cpp @@ -57,6 +57,7 @@ static struct option long_options[] = { {"matrix_size_step", required_argument, 0, 's'}, {"warm_up_loop", required_argument, 0, 'w'}, {"iter", required_argument, 0, 'i'}, + {"batch_size", required_argument, 0, 'k'}, {"csv", required_argument, 0, 'c'}, {"routines", required_argument, 0, 'r'}, {"trtri_options", required_argument, 0, 'o'}, @@ -135,6 +136,11 @@ static void __print_help_blas_perf_test() { "(default: %d)\n\n", DEFAULT_N); + printf("\t-k, --batch_size=LEN\n"); + printf("\t\tBatch size. Adds third dimension to matrices A and B.\n"); + printf("\t\t\tThe value of LEN as an integer. 
(default: %d)\n", + DEFAULT_K); + printf("\t-c, --csv=/path/to/file.csv\n"); printf("\t\tCsv output file selection.\n"); printf( @@ -166,12 +172,16 @@ int main(int argc, char **argv) { /* set default options */ options.test = DEFAULT_TEST; options.loop = DEFAULT_LOOP; + options.start.a.k = DEFAULT_K; options.start.a.m = DEFAULT_MATRIX_START; options.start.a.n = DEFAULT_MATRIX_START; + options.stop.a.k = DEFAULT_K; options.stop.a.m = DEFAULT_MATRIX_STOP; options.stop.a.n = DEFAULT_MATRIX_STOP; + options.start.b.k = DEFAULT_K; options.start.b.m = DEFAULT_MATRIX_START; options.start.b.n = DEFAULT_MATRIX_START; + options.stop.b.k = DEFAULT_K; options.stop.b.m = DEFAULT_MATRIX_STOP; options.stop.b.n = DEFAULT_MATRIX_STOP; options.step = DEFAULT_STEP; @@ -182,7 +192,7 @@ int main(int argc, char **argv) { options.blas_args.trtri.trtri_args = DEFAULT_TRTRI_ARGS; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:c:r:", long_options, + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:c:r:k:", long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas_perf_test(); return 0; @@ -255,6 +265,11 @@ int main(int argc, char **argv) { case 's': options.step = atoi(optarg); break; case 'w': options.warm_up_n = atoi(optarg); break; case 'i': options.n = atoi(optarg); break; + case 'k': + options.start.a.k = options.stop.a.k = + options.start.b.k = options.stop.b.k = + atoi(optarg); + break; case 'c': out_file = optarg; options.out_file = std::string(out_file); diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index e6b7b825a7..d60f15b92b 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -78,6 +78,64 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = { /*************************** Test types and defaults **************************/ #define DEFAULT_TRTRI_ARGS "UU" + /** + * The KokkosBatched::SerialTrtri 
implementation performs trmm and scal on subblocks + * of the A matrix. a_m subblocks are selected. + */ +static inline double __trtri_impl_flop_count(double a_m, double a_n) { + double flop_count = 0; + double flops_per_div, flops_per_mul, flops_per_add; + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) { + flops_per_div = 1; + flops_per_mul = 1; + flops_per_add = 1; + } else { + // For complex, we need to count 2 flops for each add and 6 flops for each multiply or divide. + flops_per_div = 6; + flops_per_mul = 6; + flops_per_add = 2; + } + + for (int i = 0; i < a_m; i++) { + flop_count += flops_per_div; // 1 / A[i,j] + flop_count += ((i * (i + 1)) / 2) * (flops_per_mul + flops_per_add); // TRMM FLOPS + flop_count += i * flops_per_mul; // SCAL FLOPS + } + + return flop_count; +} + +// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline double __trtri_flop_count(double a_m, double a_n) { + double flops; + double flops_per_mul; + double flops_per_add; + + if (a_m != a_n) { + fprintf(stderr, "%s:%d:ERROR: a_m != a_n.\n", __FILE__, __LINE__); + exit(255); + } + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) { + flops_per_mul = 1; + flops_per_add = 1; + } else { + // For complex, we need to count 2 flops for each add and 6 flops for each multiply. 
+ flops_per_mul = 6; + flops_per_add = 2; + } + + flops = (1./6.*a_n*a_n*a_n + 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_mul + + (1./6.*a_n*a_n*a_n - 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_add; + + return flops; +} + using view_type_3d = Kokkos::View; struct trtri_args { @@ -87,18 +145,25 @@ struct trtri_args { typedef struct trtri_args trtri_args_t; static std::string trtri_csv_header_str = - "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,warm_up_n,iter," - "total_time(s),average_time(s)"; + "algorithm,side-uplo-trans-diag,loop_type,A_dims,warm_up_n,iter," + "total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args, double time_in_seconds) { + double flops = trtri_args.A.extent(0) * __trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2)); + double gflops = flops / 1e9; + double average_time = time_in_seconds / options.n; + options.out[0] << test_e_str[options.test] << "," << options.blas_args.trtri.trtri_args << "," - << loop_e_str[options.loop] << "," << trtri_args.A.extent(1) + << loop_e_str[options.loop] << "," << trtri_args.A.extent(0) << "x" << trtri_args.A.extent(1) << "x" << trtri_args.A.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," - << time_in_seconds / options.n << std::endl; + << average_time << "," + << flops << "," + << gflops / average_time + << std::endl; } static void __print_trtri_perf_test_options(options_t options) { @@ -133,19 +198,26 @@ void __do_trtri_serial_blas(options_t options, trtri_args_t trtri_args) { STATUS; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - 
KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + } + // Fence after each batch operation + Kokkos::fence(); } timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + } + // Fence after each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -164,19 +236,26 @@ void __do_trtri_serial_batched_template(options_t options, Kokkos::Timer timer; using tag = Algo::Trtri::Unblocked; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrtri::invoke(A); + SerialTrtri::invoke(A); + } + // Fence after each batch operation + Kokkos::fence(); } timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrtri::invoke(A); + SerialTrtri::invoke(A); + } + // Fence after each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -241,16 +320,22 @@ void __do_trtri_parallel_blas(options_t options, trtri_args_t trtri_args) { STATUS; - Kokkos::parallel_for("parallelBlasWarmUpLoopTrtri", - 
Kokkos::RangePolicy(0, warm_up_n), - parallel_blas_trtri_functor); - Kokkos::fence(); + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBlasWarmUpLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBlasTimedLoopTrtri", - Kokkos::RangePolicy(0, n), - parallel_blas_trtri_functor); - Kokkos::fence(); + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBlasTimedLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -287,16 +372,23 @@ void __do_trtri_parallel_batched_template(options_t options, STATUS; - Kokkos::parallel_for("parallelBatchedWarmUpLoopTrtri", - Kokkos::RangePolicy(0, warm_up_n), - parallel_batched_trtri_functor); - Kokkos::fence(); + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBatchedTimedLoopTrtri", - Kokkos::RangePolicy(0, n), - parallel_batched_trtri_functor); - Kokkos::fence(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trtri_output_csv_row(options, trtri_args, timer.seconds()); return; @@ -345,7 +437,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { trtri_args.uplo = options.blas_args.trtri.trtri_args.c_str()[0]; trtri_args.diag = options.blas_args.trtri.trtri_args.c_str()[1]; - trtri_args.A = vta("trtri_args.A", 
options.n, dim.a.m, dim.a.n); + trtri_args.A = vta("trtri_args.A", dim.a.k, dim.a.m, dim.a.n); host_A = Kokkos::create_mirror_view(trtri_args.A); Kokkos::fill_random(trtri_args.A, rand_pool, @@ -355,7 +447,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { if (trtri_args.uplo == 'U' || trtri_args.uplo == 'u') { // Make A upper triangular - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 1; i < dim.a.m; i++) { for (int j = 0; j < i; j++) { @@ -367,7 +459,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { // Make A lower triangular // Kokkos::parallel_for("toLowerLoop", options.n, KOKKOS_LAMBDA (const int& // i) { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < dim.a.m - 1; i++) { for (int j = i + 1; j < dim.a.n; j++) { @@ -378,7 +470,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { } if (trtri_args.diag == 'U' || trtri_args.diag == 'u') { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < min_dim; i++) { A(i, i) = scalar_type(1); @@ -408,8 +500,8 @@ void __do_loop_and_invoke(options_t options, for (cur_dims = options.start; cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n; - cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, - cur_dims.b.m *= options.step, cur_dims.b.n *= options.step) { + cur_dims.a.m += options.step, cur_dims.a.n += options.step, + cur_dims.b.m += options.step, cur_dims.b.n += options.step) { trtri_args = __do_setup( options, cur_dims); fn(options, trtri_args); diff --git a/perf_test/blas/blas3/CMakeLists.txt b/perf_test/blas/blas3/CMakeLists.txt 
index c1e3a117fa..73c094387c 100644 --- a/perf_test/blas/blas3/CMakeLists.txt +++ b/perf_test/blas/blas3/CMakeLists.txt @@ -4,4 +4,5 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) KOKKOSKERNELS_ADD_EXECUTABLE( KokkosBlas3_perf_test SOURCES KokkosBlas3_perf_test.cpp + TESTONLYLIBS kokkoskernelsperf_gtest ) diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index 4952a8e606..2103d0d57e 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -61,6 +61,9 @@ #define DEFAULT_BLAS_ROUTINES "trmm,gemm," #define DEFAULT_TEAM_SIZE 1 #define DEFAULT_VECTOR_LEN 1 +#define DEFAULT_USE_AUTO 0 +#define DEFAULT_BATCH_SIZE_LAST_DIM 0 +#define DEFAULT_VERIFY 1 /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { @@ -83,6 +86,7 @@ struct blas_args { // ADD MORE BLAS3 ROUTINES HERE int team_size; int vector_len; + bool use_auto, batch_size_last_dim; // ADD MORE COMMON BLAS3 OPTIONS HERE }; typedef struct blas_args blas_args_t; @@ -116,13 +120,19 @@ static std::string loop_e_str[LOOP_N] = {"serial", "parallel"}; /** * @var BLAS: Run the blas routine through the - * KokkosBlas namespace. + * KokkosBlas namespace. * @var BATCHED_SERIAL{_BLOCKED}: Run the serial blas routine through the * KokkosBatched namespace. + * @var BATCHED_SERIAL_SIMD{_BLOCKED}: Run the serial blas routine through the + * KokkosBatched namespace using SIMD views. + * @var BATCHED_SERIAL_COMPACT_MKL: Run the serial blas mkl routine through + * the KokkosBatched namespace. * @var BATCHED_TEAM{_BLOCKED}: Run the team blas routine through the - * KokkosBatched namespace. + * KokkosBatched namespace. * @var BATCHED_TEAM_VECTOR{_BLOCKED}: Run the team vector blas routine through - * the KokkosBatched namespace. + * the KokkosBatched namespace. 
+ * @var BATCHED_TEAM_SIMD{_BLOCKED}: Run the team vector blas routine through + * the KokkosBatched namespace using SIMD views. * @var EXPERIMENT: Run the blas routine as a custom * experiment. */ @@ -130,19 +140,26 @@ typedef enum TEST { BLAS, BATCHED_SERIAL, BATCHED_SERIAL_BLOCKED, + BATCHED_SERIAL_SIMD, + BATCHED_SERIAL_SIMD_BLOCKED, + BATCHED_SERIAL_COMPACT_MKL, BATCHED_TEAM, BATCHED_TEAM_BLOCKED, BATCHED_TEAM_VECTOR, BATCHED_TEAM_VECTOR_BLOCKED, + BATCHED_TEAM_SIMD, + BATCHED_TEAM_SIMD_BLOCKED, // ADD MORE TEST TYPES HERE EXPERIMENT, TEST_N } test_e; static std::string test_e_str[TEST_N]{ - "blas", "batched_serial", "batched_serial_blocked", "batched_team", + "blas", "batched_serial", "batched_serial_blocked", "batched_serial_simd", + "batched_serial_simd_blocked", "batched_serial_compact_mkl", "batched_team", "batched_team_blocked", "batched_team_vector", - "batched_team_vector_blocked", + "batched_team_vector_blocked", "batched_team_simd", + "batched_team_simd_blocked", // ADD MORE TEST TYPES HERE "experiment"}; @@ -176,6 +193,8 @@ typedef struct matrix_dims matrix_dims_t; * @var out_file: The file to write csv data to. Defaults to stdout. * @var blas_args: Arguments for each supported blas routine. * @var blas_routines: Selects which supported blas routines to test. + * @var verify: Performs verification of the blas routine for each input + * before timing it. 
*/ struct perf_test_options { test_e test; @@ -189,6 +208,7 @@ struct perf_test_options { std::string out_file; blas_args_t blas_args; std::string blas_routines; + bool verify; }; typedef struct perf_test_options options_t; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index f26fbb7287..b9556d1c46 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -56,6 +56,8 @@ //#include "KokkosBatched_Gemm_Team_Impl.hpp" //#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" #include "KokkosBatched_Util.hpp" +#include "gtest/gtest.h" // EXPECT_NEAR +#include "KokkosKernels_TestUtils.hpp" //#define GEMM_PERF_TEST_DEBUG @@ -70,15 +72,27 @@ void do_gemm_serial_batched_blocked(options_t options); // invocation! void do_gemm_serial_batched_parallel(options_t options); void do_gemm_serial_batched_blocked_parallel(options_t options); +void do_gemm_serial_simd_batched_parallel(options_t options); +void do_gemm_serial_simd_batched_blocked_parallel(options_t options); +void do_gemm_serial_batched_compact_mkl_parallel(options_t options); void do_gemm_team_batched_parallel(options_t options); void do_gemm_team_batched_blocked_parallel(options_t options); void do_gemm_team_vector_batched_parallel(options_t options); void do_gemm_team_vector_batched_blocked_parallel(options_t options); +void do_gemm_team_simd_batched_parallel(options_t options); +void do_gemm_team_simd_batched_blocked_parallel(options_t options); void do_gemm_experiment_parallel(options_t options); struct SerialTag {}; +struct SerialBatchDim3Tag {}; +struct SerialSimdTag {}; +struct SerialSimdBatchDim3Tag {}; struct TeamTag {}; +struct TeamBatchDim3Tag {}; struct TeamVectorTag {}; +struct TeamVectorBatchDim3Tag {}; +struct TeamSimdTag {}; +struct TeamSimdBatchDim4Tag {}; struct LayoutLeftTag {}; struct LayoutRightTag {}; struct SimdCpuTag {}; @@ -90,24 +104,52 @@ void 
(*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { do_gemm_serial_batched, do_gemm_serial_batched_blocked, // Serial NULL, NULL, // Team NULL, NULL, // TeamVector + NULL, NULL, // TeamSimd NULL // Serial Experiment }, { - NULL, // BLAS - do_gemm_serial_batched_parallel, - do_gemm_serial_batched_blocked_parallel, // Serial + NULL, // BLAS + do_gemm_serial_batched_parallel, // Serial + do_gemm_serial_batched_blocked_parallel, + do_gemm_serial_simd_batched_parallel, + do_gemm_serial_simd_batched_blocked_parallel, + do_gemm_serial_batched_compact_mkl_parallel, do_gemm_team_batched_parallel, do_gemm_team_batched_blocked_parallel, // Team do_gemm_team_vector_batched_parallel, NULL, // TeamVector + do_gemm_team_simd_batched_parallel, + do_gemm_team_simd_batched_blocked_parallel, // TeamSimd do_gemm_experiment_parallel // Parallel Experiment }}; /*************************** Test types and defaults **************************/ #define DEFAULT_GEMM_ARGS "NN" #define DEFAULT_GEMM_ALPHA 1.0 +#define DEFAULT_GEMM_BETA 1.0 using view_type_3d = Kokkos::View; +using view_type_4d = + Kokkos::View; +using view_type_5d = + Kokkos::View; + +// Construct the vector type +using memory_space = typename default_device::execution_space::memory_space; +constexpr int simd_vector_size = + KokkosBatched::DefaultVectorLength::value; +constexpr int simd_internal_vector_size = + KokkosBatched::DefaultInternalVectorLength::value; +using vector_type = KokkosBatched::Vector, + simd_vector_size>; +using internal_vector_type = + KokkosBatched::Vector, + simd_internal_vector_size>; +using vector_view_type_3d = + Kokkos::View; +using internal_vector_view_type_4d = + Kokkos::View; struct batched_params { int team_size; @@ -115,39 +157,124 @@ struct batched_params { }; typedef struct batched_params batched_params_t; +/** + * @brief struct gemm_simd_args encapsulates the data types required + * for allocating and passing a single matrix to the KokkosBatched gemm + * kernels. 
To invoke gemm on a batch of matrices, three instances of this + * struct are required, one for each matrix, A, B, and C. + * + * @var vec_3d: 3-rank view type used for allocating the underlying data. + * A reference must be kept to this object to ensure the + * data is not freed by the C++ runtime. + * @var mat_4d: 4-rank view type used for populating the simd view with + * random values. + * @var ivec_4d: 4-rank view type used for passing to math kernels. This + * view type is used for leveraging simd instructions on + * both the host and device. + */ +struct gemm_simd_args { + vector_view_type_3d vec_3d; + view_type_4d mat_4d; + internal_vector_view_type_4d ivec_4d; +}; +typedef struct gemm_simd_args gemm_simd_args_t; + +/** + * @brief struct gemm_args holds the common arguments passed to + * both gemm implementations in the KokkosBlas and KokkosBatched + * namespaces throughout these performance tests. + * + * @var transA: transpose type for A matrix. + * supported types: 'n' - no transpose, 't' - transpose. + * unsupported types: 'c' - conjugate transpose. + * @var transB: transpose type for B matrix. + * supported types: 'n' - no transpose, 't' - transpose. + * unsupported types: 'c' - conjugate transpose. + * @var alpha: scalar applied to the A*B product. + * @var beta: scalar applied to C matrix. + * @var A: 3-rank view type used in all non-simd tests. + * @var B: 3-rank view type used in all non-simd tests. + * @var C: 3-rank view type used in all non-simd tests. + * @var bp: team_size and vector_length for tests that use + * Kokkos::TeamPolicy. + * @var Av: 3-rank and 4-rank vector view types for simd tests. + * @var Bv: 3-rank and 4-rank vector view types for simd tests. + * @var Cv: 3-rank and 4-rank vector view types for simd tests. 
+ */ struct gemm_args { char transA, transB; default_scalar alpha; default_scalar beta; view_type_3d A, B, C; batched_params_t bp; + // Below are matrices for simd tests + gemm_simd_args_t Av, Bv, Cv; + matrix_dims_t dims; }; typedef struct gemm_args gemm_args_t; static std::string gemm_csv_header_str = - "algorithm,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_" + "algorithm,vector_type,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_" "dims,C_dims,warm_up_n," - "iter,total_time(s),average_time(s)"; + "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ +// Flop count formula from lapack working note 41: +// http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline double __gemm_flop_count(double a_m, double a_n, double b_n) { + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return 2 * a_m * b_n * a_n; + else + // For complex, we need to count 2 flops for each add and 6 flops for each + // multiply. 
+ return (2 + 6) * a_m * b_n * a_n; +} + +static inline std::string __gemm_output_dim_string(options_t options, + matrix_dim_t dim) { + std::string x = "x"; + std::string ret = std::to_string(dim.m) + x + std::to_string(dim.n); + + if (options.blas_args.batch_size_last_dim) + return ret + x + std::to_string(dim.k); + else + return std::to_string(dim.k) + x + ret; +} + static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, const char *experiment_name = nullptr) { std::string algo_name = test_e_str[options.test]; + std::string ts = std::to_string(gemm_args.bp.team_size); + std::string vlen = std::to_string(gemm_args.bp.vector_len); + std::string vtype = internal_vector_type::label(); if (experiment_name) algo_name = std::string(experiment_name); - - options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," - << options.blas_args.gemm.alpha << "," - << options.blas_args.gemm.beta << "," << gemm_args.bp.team_size - << "," << gemm_args.bp.vector_len << "," - << loop_e_str[options.loop] << "," << gemm_args.A.extent(0) - << "x" << gemm_args.A.extent(1) << "x" << gemm_args.A.extent(2) - << "," << gemm_args.B.extent(0) << "x" << gemm_args.B.extent(1) - << "x" << gemm_args.B.extent(2) << "," << gemm_args.C.extent(0) - << "x" << gemm_args.C.extent(1) << "x" << gemm_args.C.extent(2) - << "," << options.warm_up_n << "," << options.n << "," - << time_in_seconds << "," << time_in_seconds / options.n - << std::endl; + if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO"; + + double flops; + double gflops; + double average_time = time_in_seconds / options.n; + + if (options.verify) return; + + flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, + gemm_args.dims.a.n, + gemm_args.dims.b.n); + + gflops = flops / 1e9; + + options.out[0] << algo_name << "," << vtype << "," << options.blas_args.gemm.gemm_args << "," + << static_cast(options.blas_args.gemm.alpha) << "," + << 
static_cast(options.blas_args.gemm.beta) << "," + << ts << "," << vlen << "," << loop_e_str[options.loop] << "," + << __gemm_output_dim_string(options, gemm_args.dims.a) << "," + << __gemm_output_dim_string(options, gemm_args.dims.b) << "," + << __gemm_output_dim_string(options, gemm_args.dims.c) << "," + << options.warm_up_n << "," << options.n << "," + << time_in_seconds << "," << time_in_seconds / options.n << "," + << flops << "," << gflops / average_time << std::endl; } static void __print_gemm_perf_test_options(options_t options) { @@ -181,21 +308,30 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) { STATUS; - auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, + bool batch_size_last_dim) { for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); - - KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha, - A, B, _gemm_args.beta, C); + for (int j = 0; j < _gemm_args.dims.c.k; j++) { + auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, j, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, j, Kokkos::ALL(), Kokkos::ALL()); + if (batch_size_last_dim) { + A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j); + B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j); + C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); + } + + KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, + _gemm_args.alpha, A, B, _gemm_args.beta, C); + } } }; - __do_loop(options.warm_up_n, gemm_args); + __do_loop(options.warm_up_n, gemm_args, + options.blas_args.batch_size_last_dim); Kokkos::fence(); timer.reset(); - __do_loop(options.n, gemm_args); + 
__do_loop(options.n, gemm_args, options.blas_args.batch_size_last_dim); Kokkos::fence(); __gemm_output_csv_row(options, gemm_args, timer.seconds()); @@ -213,22 +349,31 @@ void __do_gemm_serial_batched_template(options_t options, #if !defined(KOKKOS_ENABLE_CUDA) Kokkos::Timer timer; - auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, + bool batch_size_last_dim) { for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); - - SerialGemm::invoke( - _gemm_args.alpha, A, B, _gemm_args.beta, C); + for (int j = 0; j < _gemm_args.dims.c.k; j++) { + auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, j, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, j, Kokkos::ALL(), Kokkos::ALL()); + if (batch_size_last_dim) { + A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j); + B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j); + C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); + } + + SerialGemm::invoke( + _gemm_args.alpha, A, B, _gemm_args.beta, C); + } } }; - __do_loop(options.warm_up_n, gemm_args); + __do_loop(options.warm_up_n, gemm_args, + options.blas_args.batch_size_last_dim); Kokkos::fence(); timer.reset(); - __do_loop(options.n, gemm_args); + __do_loop(options.n, gemm_args, options.blas_args.batch_size_last_dim); Kokkos::fence(); __gemm_output_csv_row(options, gemm_args, timer.seconds()); #else @@ -240,8 +385,8 @@ void __do_gemm_serial_batched_template(options_t options, template void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) { - char a = gemm_args.transA; - char b = gemm_args.transB; + char a = toupper(gemm_args.transA); + char b = 
toupper(gemm_args.transB); using N = Trans::NoTranspose; using T = Trans::Transpose; // using C = Trans::ConjTranspose; @@ -272,58 +417,92 @@ void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) { return; } -#if !defined(KOKKOS_ENABLE_CUDA) -template -struct parallel_blas_gemm { +template +struct parallel_batched_gemm_range_policy { gemm_args_t gemm_args_; - parallel_blas_gemm(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} + parallel_batched_gemm_range_policy(gemm_args_t gemm_args) + : gemm_args_(gemm_args) {} KOKKOS_INLINE_FUNCTION - void operator()(const int &i) const { + void operator()(const SerialTag &, const int &i) const { auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::gemm(&gemm_args_.transA, &gemm_args_.transB, gemm_args_.alpha, - svA, svB, gemm_args_.beta, svC); + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); } -}; -#endif // !KOKKOS_ENABLE_CUDA -template -void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) { -#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) - uint32_t warm_up_n = options.warm_up_n; - uint32_t n = options.n; - Kokkos::Timer timer; - using execution_space = typename device_type::execution_space; - using functor_type = parallel_blas_gemm; - functor_type parallel_blas_gemm_functor(gemm_args); + KOKKOS_INLINE_FUNCTION + void operator()(const SerialBatchDim3Tag &, const int &i) const { + auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i); - STATUS; + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } - 
Kokkos::parallel_for("parallelBlasWarmUpLoopGemm", - Kokkos::RangePolicy(0, warm_up_n), - parallel_blas_gemm_functor); - Kokkos::fence(); + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdTag &, const int &i) const { + auto svA = + Kokkos::subview(gemm_args_.Av.vec_3d, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = + Kokkos::subview(gemm_args_.Bv.vec_3d, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = + Kokkos::subview(gemm_args_.Cv.vec_3d, i, Kokkos::ALL(), Kokkos::ALL()); - timer.reset(); - Kokkos::parallel_for("parallelBlasTimedLoopGemm", - Kokkos::RangePolicy(0, n), - parallel_blas_gemm_functor); - Kokkos::fence(); - __gemm_output_csv_row(options, gemm_args, timer.seconds()); -#else - std::cerr << std::string(__func__) - << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl; - __gemm_output_csv_row(options, gemm_args, -1); -#endif // !KOKKOS_ENABLE_CUDA - return; -} + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdBatchDim3Tag &, const int &i) const { + auto svA = + Kokkos::subview(gemm_args_.Av.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = + Kokkos::subview(gemm_args_.Bv.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = + Kokkos::subview(gemm_args_.Cv.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamTag &, const int &i) const { + Kokkos::abort("TeamTag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamBatchDim3Tag &, const int &i) const { + Kokkos::abort("TeamBatchDim3Tag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorTag &, const int &i) const { + Kokkos::abort("TeamVectorTag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const 
TeamVectorBatchDim3Tag &, const int &i) const { + Kokkos::abort("TeamVectorBatchDim3Tag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamSimdTag &, const int &i) const { + Kokkos::abort("TeamSimdTag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamSimdBatchDim4Tag &, const int &i) const { + Kokkos::abort("TeamSimdBatchDim4Tag not supported using RangePolicy."); + } +}; template + class BlockingType, class AlgoMode = void> struct parallel_batched_gemm { gemm_args_t gemm_args_; @@ -340,6 +519,17 @@ struct parallel_batched_gemm { gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); } + KOKKOS_INLINE_FUNCTION + void operator()(const SerialBatchDim3Tag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + KOKKOS_INLINE_FUNCTION void operator()(const TeamTag &, const MemberType &member) const { auto i = member.league_rank(); @@ -352,6 +542,18 @@ struct parallel_batched_gemm { svB, gemm_args_.beta, svC); } + KOKKOS_INLINE_FUNCTION + void operator()(const TeamBatchDim3Tag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::TeamGemm::invoke(member, gemm_args_.alpha, svA, + svB, gemm_args_.beta, svC); + } + KOKKOS_INLINE_FUNCTION void operator()(const TeamVectorTag &, const MemberType &member) const { auto team_idx = member.league_rank(); @@ -368,50 +570,202 @@ struct 
parallel_batched_gemm { svB, gemm_args_.beta, svC); } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorBatchDim3Tag &, + const MemberType &member) const { + auto team_idx = member.league_rank(); + auto svA = + Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), team_idx); + auto svB = + Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), team_idx); + auto svC = + Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), team_idx); + + KokkosBatched::TeamVectorGemm::invoke(member, + gemm_args_.alpha, svA, + svB, gemm_args_.beta, + svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamSimdTag &, const MemberType &member) const { + auto i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, gemm_args_.Cv.ivec_4d.extent(3)), + [&](const int &vector_lane) { + auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, i, Kokkos::ALL(), + Kokkos::ALL(), vector_lane); + auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, i, Kokkos::ALL(), + Kokkos::ALL(), vector_lane); + auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, i, Kokkos::ALL(), + Kokkos::ALL(), vector_lane); + + KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, + svA, svB, gemm_args_.beta, + svC); + }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamSimdBatchDim4Tag &, + const MemberType &member) const { + auto i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, gemm_args_.Cv.ivec_4d.extent(0)), + [&](const int &vector_lane) { + auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane, + Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane, + Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane, + Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, + svA, svB, gemm_args_.beta, + svC); + }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdTag &, const 
MemberType &member) const { + Kokkos::abort("SerialSimdTag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdBatchDim3Tag &, + const MemberType &member) const { + Kokkos::abort("SerialSimdBatchDim3Tag not supported using RangePolicy."); + } }; template -void __do_gemm_parallel_batched_template(options_t options, - gemm_args_t gemm_args) { +void __do_gemm_parallel_batched_template_range_policy(options_t options, + gemm_args_t gemm_args) { using execution_space = typename device_type::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; + using policy_type = Kokkos::RangePolicy; using functor_type = - parallel_batched_gemm; + parallel_batched_gemm_range_policy; uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; - auto league_size = options.start.c.k; + auto batch_size = options.start.c.k; Kokkos::Timer timer; STATUS; functor_type parallel_batched_gemm_functor(gemm_args); - auto team_size = gemm_args.bp.team_size; - auto vector_len = gemm_args.bp.vector_len; + + if (std::is_same::value || + std::is_same::value) { + batch_size = options.blas_args.batch_size_last_dim + ? 
gemm_args.Cv.vec_3d.extent(2) + : gemm_args.Cv.vec_3d.extent(0); + } for (uint32_t i = 0; i < warm_up_n; i++) { Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(league_size, team_size, vector_len), + policy_type(0, batch_size), parallel_batched_gemm_functor); + Kokkos::fence(); } - Kokkos::fence(); timer.reset(); for (uint32_t i = 0; i < n; i++) { Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(league_size, team_size, vector_len), + policy_type(0, batch_size), parallel_batched_gemm_functor); + Kokkos::fence(); } - Kokkos::fence(); __gemm_output_csv_row(options, gemm_args, timer.seconds()); return; } -template +template +void __do_gemm_parallel_batched_template(options_t options, + gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + Kokkos::Timer timer; + + if (std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value) { + return __do_gemm_parallel_batched_template_range_policy< + TransAType, TransBType, BlockingType, AlgoTag, device_type>(options, + gemm_args); + } + + if (std::is_same::value || + std::is_same::value) { + league_size = options.blas_args.batch_size_last_dim + ? gemm_args.Cv.ivec_4d.extent(3) + : gemm_args.Cv.ivec_4d.extent(0); + vector_len = simd_vector_size / + simd_internal_vector_size; // TODO: use bp.vector_len? 
+ } + + STATUS; + + functor_type parallel_batched_gemm_functor(gemm_args); + + if (options.blas_args.use_auto) { + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + } else { + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + } + + __gemm_output_csv_row(options, gemm_args, timer.seconds()); + + return; +} + +template void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { char a = gemm_args.transA; char b = gemm_args.transB; @@ -423,19 +777,23 @@ void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { if (a == 'N' && b == 'N') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); } else if (a == 'N' && b == 'T') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); //} else if (a == 'N' && b == 'C') { // __do_gemm_parallel_batched_template(options, gemm_args); } else if (a == 'T' && b == 'N') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); } else if (a == 'T' && b == 'T') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); //} else if (a == 'T' && b == 
'C') { // __do_gemm_parallel_batched_template(options, gemm_args); @@ -796,7 +1154,8 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { using scalar_type = typename view_type_3d::value_type; constexpr int vl = KokkosBatched::DefaultVectorLength::value; - using simd_type = KokkosBatched::Vector, vl>; + using simd_type = + KokkosBatched::Vector, simd_vector_size>; using simd_view_type = Kokkos::View; using functor_type = @@ -821,12 +1180,12 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { // uint64_t seed = Kokkos::Impl::clock_tic(); // Kokkos::Random_XorShift64_Pool rand_pool(seed); // Kokkos::fill_random(A, rand_pool, - // Kokkos::rand, simd_type>::max()); - // Kokkos::fill_random(B, rand_pool, - // Kokkos::rand, simd_type>::max()); - // Kokkos::fill_random(C, rand_pool, - // Kokkos::rand, simd_type>::max()); - // execution_space::fence(); + // Kokkos::rand, + // simd_type>::max()); Kokkos::fill_random(B, rand_pool, + // Kokkos::rand, + // simd_type>::max()); Kokkos::fill_random(C, rand_pool, + // Kokkos::rand, + // simd_type>::max()); execution_space::fence(); functor_type experiment5_functor(A, B, C, gemm_args); @@ -854,9 +1213,410 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { return; } +template +class parallel_batched_gemm_experiment6 { + private: + SimdViewType &A, &B, &C; + gemm_args_t gemm_args; + + public: + parallel_batched_gemm_experiment6(SimdViewType &_A, SimdViewType &_B, + SimdViewType &_C, gemm_args_t _gemm_args) + : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + + // Uses two serial for-loops internally + 
KokkosBatched::TeamVectorGemm::invoke(member, gemm_args.alpha, + svA, svB, + gemm_args.beta, svC); + } +}; + +template +void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { +#if 0 + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + // Construct the vector type + using scalar_type = typename view_type_3d::value_type; + constexpr int vl = + KokkosBatched::DefaultVectorLength::value; + constexpr int il = + KokkosBatched::DefaultInternalVectorLength::value; + using view_type = Kokkos::View; + using vector_view_type = Kokkos::View; + using internal_vector_view_type = Kokkos::View; + using functor_type = + parallel_batched_gemm_experiment6; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto k = options.start.c.k; + Kokkos::Timer timer; + auto simd_batch_size = k / vl + (k % vl > 0); + STATUS; + + // Construct matrices + vector_view_type A_vector("A_vector", simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1)); + view_type A((scalar_type *)A_vector.data(), simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1)); + internal_vector_view_type A_vector_internal(A_vector.data(), simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1)); + + vector_view_type B_vector("B_vector", simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1)); + view_type B((scalar_type *)B_vector.data(), simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1)); + internal_vector_view_type B_vector_internal(B_vector.data(), simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1)); + + vector_view_type C_vector("C_vector", simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); + view_type C((scalar_type *)C_vector.data(), simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); + internal_vector_view_type C_vector_internal(C_vector.data(), simd_batch_size, gemm_args.C.extent(0), 
gemm_args.C.extent(1)); + + uint64_t seed = Kokkos::Impl::clock_tic(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fill_random(B, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fill_random(C, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fence(); + + functor_type experiment6_functor(A_vector_internal, B_vector_internal, C_vector_internal, gemm_args); + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment6Gemm", + policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment6_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment6Gemm", + policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment6_functor); + Kokkos::fence(); + } + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment6"); +#endif + return; +} + +/** + * Check difference of scalars expected and actual at indexes i,j,k + * @var expected: The expected result. + * @var actual: The actual result. + * @var epsilon: The tolerance to use when comparing. + * @return true if the comparison fails and false if the comparison succeeds. + */ +template +static inline bool __gemm_print_compare_failure(ViewType h_expected, + ViewType h_actual, int i, + int j, int k, double epsilon) { + STATUS; + auto diff = static_cast(Kokkos::Experimental::fabs( + static_cast(h_expected(i, j, k) - h_actual(i, j, k)))); + + if (diff > epsilon) { + printf( + "fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", + i, j, k, static_cast(h_expected(i, j, k)), i, j, k, + static_cast(h_actual(i, j, k)), diff, epsilon); + FATAL_ERROR("Comparison failure!"); + return true; + } + return false; +} + +/** + * Compare all values of expected with all values of actual. 
+ * @var expected: the expected results + * @var actual: the actual results + * @return false if expected matches actual within epsilon, otherwise true. + */ +template +static inline bool __gemm_do_compare(view_type_3d expected, + view_type_3d actual) { + double epsilon = Test::epsilon::value * 1e3; + STATUS; + + typename view_type_3d::HostMirror h_expected = + Kokkos::create_mirror_view(expected); + typename view_type_3d::HostMirror h_actual = + Kokkos::create_mirror_view(actual); + + // Copy to host for comparision + Kokkos::deep_copy(h_expected, expected); + Kokkos::deep_copy(h_actual, actual); + Kokkos::fence(); + + if (std::is_same::value) { + for (size_t i = 0; i < h_expected.extent(0); i++) { + for (size_t j = 0; j < h_expected.extent(1); j++) { + for (size_t k = 0; k < h_expected.extent(2); k++) { + if (__gemm_print_compare_failure(h_expected, h_actual, i, j, k, epsilon)) + return true; + } + } + } + } + + if (std::is_same::value) { + for (size_t k = 0; k < h_expected.extent(2); k++) { + for (size_t j = 0; j < h_expected.extent(1); j++) { + for (size_t i = 0; i < h_expected.extent(0); i++) { + if (__gemm_print_compare_failure(h_expected, h_actual, i, j, k, epsilon)) + return true; + } + } + } + } + + return false; +} + +template +static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, + dstViewType dst, + options_t options) { + using dst_scalar_type = typename dstViewType::value_type; + using src_scalar_type = typename view_type_5d::value_type; + size_t remainder, vector_batch_size, simd_batch_size, last_batch; + bool data_layout_same_as_3d_view = false; + typename dstViewType::HostMirror h_dst = + Kokkos::create_mirror_view(dst); + typename view_type_4d::HostMirror h_src = + Kokkos::create_mirror_view(src.mat_4d); + Kokkos::deep_copy(h_src, src.mat_4d); + Kokkos::fence(); + + if (options.blas_args.batch_size_last_dim) { + remainder = dst.extent(2) % simd_internal_vector_size; + vector_batch_size = src.ivec_4d.extent(0); + simd_batch_size 
= src.ivec_4d.extent(3); + last_batch = dst.extent(2); + if (std::is_same::value && remainder == 0) + data_layout_same_as_3d_view = true; + + } else { + remainder = dst.extent(0) % simd_internal_vector_size; + vector_batch_size = src.ivec_4d.extent(3); + simd_batch_size = src.ivec_4d.extent(0); + last_batch = dst.extent(0); + if (std::is_same::value && remainder == 0) + data_layout_same_as_3d_view = true; + } + + // When the batch_size is a multiple of the simd_vector_size and the batch_size + // dimension is nearest to the simd_vector_size dimension, each 2-rank matrix + // lies in the correct location and the data can simply be cast to the 3d view. + if (data_layout_same_as_3d_view) { + // We can just re-cast the data to the 3d view but we'll copy it for verification + memcpy(h_dst.data(), h_src.data(), + sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * + dst.extent(2)); + Kokkos::deep_copy(dst, h_dst); + Kokkos::fence(); + return; + } + + // If the remainder is 0, we have simd_vector_size sub-batches to copy out... + // this is a bad data access pattern but for these perf_tests we will support it. + // If the remainder is non-zero, we have simd_vector_size sub-batches + remainder to + // copy out. + remainder += simd_internal_vector_size; + + // Views needed for slow manual copy + using h_view_type_5d = Kokkos::View; + using h_subview_type_2d = Kokkos::View; + using h_subview_type_3d = Kokkos::View; + using h_subview_type_4d = Kokkos::View; + h_view_type_5d h_src_raw; + h_subview_type_4d h_sv0; + h_subview_type_3d h_sv1; + h_subview_type_2d h_sv2; + + // TODO: Clean everything below this point up... 
+ if (std::is_same::value) + h_src_raw = h_view_type_5d((src_scalar_type *)h_src.data(), src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3), simd_internal_vector_size); + else + h_src_raw = h_view_type_5d((src_scalar_type *)h_src.data(), + simd_internal_vector_size, src.ivec_4d.extent(0), + src.ivec_4d.extent(1), src.ivec_4d.extent(2), + src.ivec_4d.extent(3)); + + // The below loops copies each corresponding 2-rank matrix within the simd + // view back to the 3-rank view. + for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; + simd_internal_vec_idx++) { + if (std::is_same::value) + h_sv0 = Kokkos::subview(h_src_raw, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), simd_internal_vec_idx); + else + h_sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + + for (size_t vector_batch_idx = 0; + vector_batch_idx < vector_batch_size; vector_batch_idx++) { + if (options.blas_args.batch_size_last_dim) + h_sv1 = Kokkos::subview(h_sv0, vector_batch_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + else + h_sv1 = Kokkos::subview(h_sv0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), vector_batch_idx); + for (size_t simd_batch_size_idx = 0; + simd_batch_size_idx < simd_batch_size; + simd_batch_size_idx++) { + if (options.blas_args.batch_size_last_dim) + h_sv2 = Kokkos::subview(h_sv1, Kokkos::ALL(), Kokkos::ALL(), simd_batch_size_idx); + else + h_sv2 = Kokkos::subview(h_sv1, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL()); + for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { + for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { + if (options.blas_args.batch_size_last_dim) + h_dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = h_sv2(m, n); + else + h_dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = h_sv2(m, n); + } + } + if (simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx 
== last_batch - 1) + goto out; + } + } + } +out: + Kokkos::deep_copy(dst, h_dst); + Kokkos::fence(); +} + +/** + * Compare all values of expected with all values of actual. + * @var expected: the expected results + * @var actual: the actual results + * @return false if expected matches actual within epsilon, otherwise true. + */ +template +static inline bool __gemm_do_compare(view_type_3d expected, + gemm_simd_args_t actual, + options_t options) { + decltype(expected) actual_data("actual_data", expected.extent(0), + expected.extent(1), expected.extent(2)); + + STATUS; + + // Copy the simd view to a 3d view for comparision. + // NOTE: The raw results are different when batch_size % simd_vector_size != + // 0. Also note that when batch_size % simd_vector_size != 0, the simd + // operation calculates results that we do not require. So, we end up running + // an extra batch_size % simd_vector_size GEMMs! + __gemm_copy_simd_view_to_3d_view(actual, actual_data, options); + return __gemm_do_compare(expected, actual_data); +} + +template +static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, + void (*fn)(options_t, gemm_args_t)) { + using execution_space = typename DeviceType::execution_space; + // Just create "expected" types using non-simd types. 
+ decltype(gemm_args.C) C_expected; + decltype(gemm_args.A) A_expected; + decltype(gemm_args.B) B_expected; + STATUS; + + if (options.blas_args.batch_size_last_dim) { + C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.m, + gemm_args.dims.c.n, gemm_args.dims.c.k); + A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.m, + gemm_args.dims.a.n, gemm_args.dims.a.k); + B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.m, + gemm_args.dims.b.n, gemm_args.dims.b.k); + } else { + C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.k, + gemm_args.dims.c.m, gemm_args.dims.c.n); + A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.k, + gemm_args.dims.a.m, gemm_args.dims.a.n); + B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.k, + gemm_args.dims.b.m, gemm_args.dims.b.n); + } + + // Initialize "expected" matrices. + if (gemm_args.C.data() != nullptr) { + Kokkos::deep_copy(C_expected, gemm_args.C); + Kokkos::deep_copy(A_expected, gemm_args.A); + Kokkos::deep_copy(B_expected, gemm_args.B); + + Kokkos::fence(); // Ensure that deep_copy has completed + + // Check that initial values match + if (__gemm_do_compare(C_expected, gemm_args.C)) + FATAL_ERROR("Inital values mismatch!"); + } else if (gemm_args.Cv.vec_3d.data() != nullptr) { + __gemm_copy_simd_view_to_3d_view(gemm_args.Cv, + C_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Av, + A_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Bv, + B_expected, options); + + // Check that initial values match + if (__gemm_do_compare(C_expected, gemm_args.Cv, + options)) + FATAL_ERROR("Inital values mismatch!"); + } else { + FATAL_ERROR("Input arguments are empty!"); + } + + // Populate "expected" matrices via VanillaGemm + Test::Functor_BatchedVanillaGEMM + vgemm; + vgemm.A_t = toupper(gemm_args.transA) == 'T'; + vgemm.B_t = toupper(gemm_args.transB) == 'T'; + vgemm.A_c = vgemm.B_c = false; + vgemm.batch_size_last_dim = 
options.blas_args.batch_size_last_dim; + vgemm.A = A_expected; + vgemm.B = B_expected; + vgemm.C = C_expected; + vgemm.alpha = gemm_args.alpha; + vgemm.beta = gemm_args.beta; + vgemm.run(); // Compute C_expected + + // Run routine with warm_up_n = 1 and n = 0. + auto warm_up_n_bak = options.warm_up_n; + options.warm_up_n = 1; + auto n_bak = options.n; + options.n = 0; + fn(options, gemm_args); + + Kokkos::fence(); // Redundant fence. + + // Check the result + if (gemm_args.C.data() != nullptr) { + if (__gemm_do_compare(C_expected, gemm_args.C)) + FATAL_ERROR("Result value mismatch!"); + } + + if (gemm_args.Cv.vec_3d.data() != nullptr) { + if (__gemm_do_compare(C_expected, gemm_args.Cv, + options)) + FATAL_ERROR("Result value mismatch!"); + } + + // Run actual timed test. + options.verify = false; // Set verify to false for csv output. + options.warm_up_n = warm_up_n_bak; + options.n = n_bak; + fn(options, gemm_args); + + // Reset verify for next matrix size. + options.verify = true; +} + /*************************** Internal setup fns **************************/ template -gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { +gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { using execution_space = typename device_type::execution_space; gemm_args_t gemm_args; @@ -864,25 +1624,151 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { Kokkos::Random_XorShift64_Pool rand_pool(seed); STATUS; - gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; - gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; - gemm_args.A = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n); - gemm_args.B = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n); - gemm_args.C = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n); + gemm_args.dims = dims; + gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; + gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; + if (options.test == BATCHED_TEAM_SIMD || + options.test == 
BATCHED_TEAM_SIMD_BLOCKED || + options.test == BATCHED_SERIAL_SIMD || + options.test == BATCHED_SERIAL_SIMD_BLOCKED || + options.test == BATCHED_SERIAL_COMPACT_MKL) { + // Calculate the batch size for simd views + auto a_simd_batch_size = + dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0); + auto b_simd_batch_size = + dims.b.k / simd_vector_size + (dims.b.k % simd_vector_size > 0); + auto c_simd_batch_size = + dims.c.k / simd_vector_size + (dims.c.k % simd_vector_size > 0); + + // Reference gemm simd arguments for allocating A, B, and C matrices + gemm_simd_args_t &A = gemm_args.Av, &B = gemm_args.Bv, &C = gemm_args.Cv; + + if (options.blas_args.batch_size_last_dim) { + // Construct simd matrices with batch_size in the last dimension (better + // for LayoutLeft views) + A.vec_3d = vector_view_type_3d("A_vector", dims.a.m, dims.a.n, + a_simd_batch_size); + A.mat_4d = view_type_4d((scalar_type *)A.vec_3d.data(), simd_vector_size, + dims.a.m, dims.a.n, a_simd_batch_size); + A.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)A.mat_4d.data(), + simd_vector_size / simd_internal_vector_size, dims.a.m, dims.a.n, + a_simd_batch_size); + + B.vec_3d = vector_view_type_3d("B_vector", dims.b.m, dims.b.n, + b_simd_batch_size); + B.mat_4d = view_type_4d((scalar_type *)B.vec_3d.data(), simd_vector_size, + dims.b.m, dims.b.n, b_simd_batch_size); + B.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)B.mat_4d.data(), + simd_vector_size / simd_internal_vector_size, dims.b.m, dims.b.n, + b_simd_batch_size); + + C.vec_3d = vector_view_type_3d("C_vector", dims.c.m, dims.c.n, + c_simd_batch_size); + C.mat_4d = view_type_4d((scalar_type *)C.vec_3d.data(), simd_vector_size, + dims.c.m, dims.c.n, c_simd_batch_size); + C.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)C.mat_4d.data(), + simd_vector_size / simd_internal_vector_size, dims.c.m, dims.c.n, + c_simd_batch_size); + + } else { + // Construct simd matrices with 
batch_size in the first dimension (better + // for LayoutRight views) + A.vec_3d = vector_view_type_3d("A_vector", a_simd_batch_size, dims.a.m, + dims.a.n); + A.mat_4d = view_type_4d((scalar_type *)A.vec_3d.data(), a_simd_batch_size, + dims.a.m, dims.a.n, simd_vector_size); + A.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)A.mat_4d.data(), a_simd_batch_size, dims.a.m, + dims.a.n, simd_vector_size / simd_internal_vector_size); + + B.vec_3d = vector_view_type_3d("B_vector", b_simd_batch_size, dims.b.m, + dims.b.n); + B.mat_4d = view_type_4d((scalar_type *)B.vec_3d.data(), b_simd_batch_size, + dims.b.m, dims.b.n, simd_vector_size); + B.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)B.mat_4d.data(), b_simd_batch_size, dims.b.m, + dims.b.n, simd_vector_size / simd_internal_vector_size); + + C.vec_3d = vector_view_type_3d("C_vector", c_simd_batch_size, dims.c.m, + dims.c.n); + C.mat_4d = view_type_4d((scalar_type *)C.vec_3d.data(), c_simd_batch_size, + dims.c.m, dims.c.n, simd_vector_size); + C.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)C.mat_4d.data(), c_simd_batch_size, dims.c.m, + dims.c.n, simd_vector_size / simd_internal_vector_size); + } + + // Use the non-simd 4-rank view type to randomly populate the gemm simd + // arguments + using tmp_view_type_4d = + Kokkos::View; + tmp_view_type_4d tmpA( + "tmpA", gemm_args.Av.mat_4d.extent(0), gemm_args.Av.mat_4d.extent(1), + gemm_args.Av.mat_4d.extent(2), gemm_args.Av.mat_4d.extent(3)); + Kokkos::fill_random(tmpA, rand_pool, + Kokkos::rand, + double>::max()); + tmp_view_type_4d tmpB( + "tmpB", gemm_args.Bv.mat_4d.extent(0), gemm_args.Bv.mat_4d.extent(1), + gemm_args.Bv.mat_4d.extent(2), gemm_args.Bv.mat_4d.extent(3)); + Kokkos::fill_random(tmpB, rand_pool, + Kokkos::rand, + double>::max()); + tmp_view_type_4d tmpC( + "tmpC", gemm_args.Cv.mat_4d.extent(0), gemm_args.Cv.mat_4d.extent(1), + gemm_args.Cv.mat_4d.extent(2), gemm_args.Cv.mat_4d.extent(3)); + 
Kokkos::fill_random(tmpC, rand_pool, + Kokkos::rand, + double>::max()); + Kokkos::fence(); + Kokkos::deep_copy(gemm_args.Av.mat_4d, tmpA); + Kokkos::deep_copy(gemm_args.Bv.mat_4d, tmpB); + Kokkos::deep_copy(gemm_args.Cv.mat_4d, tmpC); + Kokkos::fence(); + } else { + if (options.blas_args.batch_size_last_dim) { + gemm_args.A = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k); + gemm_args.B = vtb("gemm_args.B", dims.b.m, dims.b.n, dims.b.k); + gemm_args.C = vtc("gemm_args.C", dims.c.m, dims.c.n, dims.c.k); + } else { + gemm_args.A = vta("gemm_args.A", dims.a.k, dims.a.m, dims.a.n); + gemm_args.B = vtb("gemm_args.B", dims.b.k, dims.b.m, dims.b.n); + gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); + } + + using tmp_view_type_3d = + Kokkos::View; + tmp_view_type_3d tmpA("tmpA", gemm_args.A.extent(0), gemm_args.A.extent(1), + gemm_args.A.extent(2)); + Kokkos::fill_random(tmpA, rand_pool, + Kokkos::rand, + double>::max()); + tmp_view_type_3d tmpB("tmpB", gemm_args.B.extent(0), gemm_args.B.extent(1), + gemm_args.B.extent(2)); + Kokkos::fill_random(tmpB, rand_pool, + Kokkos::rand, + double>::max()); + tmp_view_type_3d tmpC("tmpC", gemm_args.C.extent(0), gemm_args.C.extent(1), + gemm_args.C.extent(2)); + Kokkos::fill_random(tmpC, rand_pool, + Kokkos::rand, + double>::max()); + + Kokkos::fence(); + Kokkos::deep_copy(gemm_args.A, tmpA); + Kokkos::deep_copy(gemm_args.B, tmpB); + Kokkos::deep_copy(gemm_args.C, tmpC); + Kokkos::fence(); + } gemm_args.alpha = options.blas_args.gemm.alpha; - gemm_args.alpha = options.blas_args.gemm.beta; + gemm_args.beta = options.blas_args.gemm.beta; gemm_args.bp.team_size = options.blas_args.team_size; gemm_args.bp.vector_len = options.blas_args.vector_len; - Kokkos::fill_random(gemm_args.A, rand_pool, - Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.B, rand_pool, - Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.C, rand_pool, - Kokkos::rand, - scalar_type>::max()); + Kokkos::fence(); 
// Ensure that fill_random has completed. return gemm_args; } @@ -897,7 +1783,8 @@ void __do_loop_and_invoke(options_t options, __print_gemm_perf_test_options(options); std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:" << typeid(default_device).name() << std::endl; + << ", DEVICE:" << typeid(default_device).name() + << ", SPACE:" << typeid(memory_space).name() << std::endl; options.out[0] << gemm_csv_header_str << std::endl; @@ -905,12 +1792,18 @@ void __do_loop_and_invoke(options_t options, cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n && cur_dims.c.m <= options.stop.c.m && cur_dims.c.n <= options.stop.c.n; - cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, - cur_dims.b.m *= options.step, cur_dims.b.n *= options.step, - cur_dims.c.m *= options.step, cur_dims.c.n *= options.step) { + cur_dims.a.m += options.step, cur_dims.a.n += options.step, + cur_dims.b.m += options.step, cur_dims.b.n += options.step, + cur_dims.c.m += options.step, cur_dims.c.n += options.step) { gemm_args = __do_setup(options, cur_dims); - fn(options, gemm_args); + + if (options.verify) { + __gemm_do_verify( + options, gemm_args, fn); + } else { + fn(options, gemm_args); + } } return; } @@ -944,44 +1837,173 @@ void do_gemm_serial_batched_blocked(options_t options) { void do_gemm_serial_batched_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } void do_gemm_serial_batched_blocked_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + 
__do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_serial_simd_batched_parallel(options_t options) { + STATUS; + // SerialBatchDim3Tag + // SerialSimdTag + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_serial_simd_batched_blocked_parallel(options_t options) { + STATUS; + // SerialBatchDim3Tag + // SerialSimdTag + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_serial_batched_compact_mkl_parallel(options_t options) { + STATUS; +#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ + defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ + defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); +#else +#if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) + std::cerr + << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is undefined." + << std::endl; +#elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) + std::cerr << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is " + "undefined." + << std::endl; +#elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) + std::cerr + << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ " + "is undefined." 
+ << std::endl; +#endif +#endif return; } void do_gemm_team_batched_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } void do_gemm_team_batched_blocked_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, - __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } void do_gemm_team_vector_batched_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); return; } +void do_gemm_team_simd_batched_parallel(options_t options) { + STATUS; + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_team_simd_batched_blocked_parallel(options_t options) { + STATUS; + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +// Blocked algo not yet implemented for TeamVectorGemm. 
/* void do_gemm_team_vector_batched_blocked_parallel(options_t options) { STATUS; __do_loop_and_invoke( @@ -1010,6 +2032,9 @@ void do_gemm_experiment_parallel(options_t options) { __do_loop_and_invoke( options, __do_gemm_parallel_experiment5); + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment6); } #endif // KOKKOSBLAS3_GEMM_PERF_TEST_H_ diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index b493c244d8..149cc00fd1 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -63,17 +63,19 @@ static struct option long_options[] = { {"trmm_options", required_argument, 0, 'o'}, {"trmm_alpha", required_argument, 0, 'a'}, {"gemm_options", required_argument, 0, 'g'}, - {"gemm_alpha", required_argument, 0, 'p'}, + {"gemm_scalars", required_argument, 0, 'p'}, {"team_size", required_argument, 0, 'z'}, {"vector_len", required_argument, 0, 'n'}, {"batch_size", required_argument, 0, 'k'}, + {"batch_size_last_dim", required_argument, 0, 'd'}, + {"verify", required_argument, 0, 'v'}, {0, 0, 0, 0}}; static void __print_help_blas3_perf_test() { printf("Options:\n"); printf("\t-h, --help\n"); - printf("\t\tPrint this help menu.\n\n"); + printf("\t\tPrint this help menu.\n"); printf("\t-t, --test=OPTION\n"); printf("\t\tAlgorithm selection.\n"); @@ -104,10 +106,12 @@ static void __print_help_blas3_perf_test() { "%s)\n", DEFAULT_GEMM_ARGS); - printf("\t-p, --gemm_alpha=SCALAR_VALUE\n"); - printf("\t\tGEMM alpha value.\n"); - printf("\t\t\tThe value of alpha in floating point. (default: %lf)\n", - DEFAULT_GEMM_ALPHA); + printf("\t-p, --gemm_scalars=ALPHA_SCALAR_VALUE,BETA_SCALAR_VALUE\n"); + printf("\t\tGEMM alpha and beta values.\n"); + printf( + "\t\t\tThe value of alpha and beta in floating point. 
(default: " + "%lf,%lf)\n", + DEFAULT_GEMM_ALPHA, DEFAULT_GEMM_BETA); printf("\t-z, --team_size=SIZE\n"); printf("\t\tKokkos team size.\n"); @@ -119,10 +123,27 @@ static void __print_help_blas3_perf_test() { printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", DEFAULT_VECTOR_LEN); + printf("\t-u, --use_auto=AUTO\n"); + printf( + "\t\tWhether to use Kokkos::AUTO for vector_len and team_size " + "(Heirarchical parallelism).\n"); + printf( + "\t\t\tValid values for AUTO are 1 to use Kokkos::AUTO and 0 to use " + "--vector_len and --team_size " + "instead. (default: %d)\n", + DEFAULT_USE_AUTO); + printf("\t-k, --batch_size=LEN\n"); printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n"); - printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", - DEFAULT_VECTOR_LEN); + printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", DEFAULT_K); + + printf("\t-d, --batch_size_last_dim=LAST_DIM\n"); + printf("\t\tHow to allocate the batch_size in the matrices.\n"); + printf( + "\t\t\tValid values for LAST_DIM are 1 make the batch_size the last " + "dimension and 0 to make the batch_size " + "the first dimension (default: %d)\n", + DEFAULT_BATCH_SIZE_LAST_DIM); printf("\t-l, --loop_type=OPTION\n"); printf("\t\tLoop selection.\n"); @@ -134,7 +155,7 @@ static void __print_help_blas3_perf_test() { printf("%c[1m", 27); printf("\t\t\t\tparallel:"); printf("%c[0m", 27); - printf(" invoke blas routine in a Kokkos::parallel_for-loop.\n\n"); + printf(" invoke blas routine in a Kokkos::parallel_for-loop.\n"); printf("\t-b, --matrix_size_start=MxN,IxJ,PxQ\n"); printf( @@ -142,7 +163,7 @@ static void __print_help_blas3_perf_test() { "(start)\n"); printf( "\t\t\tValid values for M and N are any non-negative 32-bit integers. 
" - "(default: %dx%d,%dx%d,%dx%d)\n\n", + "(default: %dx%d,%dx%d,%dx%d)\n", DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START); @@ -152,7 +173,7 @@ static void __print_help_blas3_perf_test() { "(stop)\n"); printf( "\t\t\tValid dimension values are any non-negative 32-bit integers. " - "(default: %dx%d,%dx%d,%dx%d)\n\n", + "(default: %dx%d,%dx%d,%dx%d)\n", DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP); @@ -160,35 +181,43 @@ static void __print_help_blas3_perf_test() { printf("\t\tMatrix step selection.\n"); printf( "\t\t\tValid value for K is any non-negative 32-bit integer. (default: " - "%d)\n\n", + "%d)\n", DEFAULT_STEP); printf("\t-w, --warm_up_loop=LOOP\n"); printf("\t\tWarm up loop selection. (untimed)\n"); printf( "\t\t\tValid value for LOOP is any non-negative 32-bit integer that's <= " - "ITER. (default: %d)\n\n", + "ITER. (default: %d)\n", DEFAULT_WARM_UP_N); printf("\t-i, --iter=ITER\n"); printf("\t\tIteration selection. (timed)\n"); printf( "\t\t\tValid value for ITER is any non-negative 32-bit integer. " - "(default: %d)\n\n", + "(default: %d)\n", DEFAULT_N); printf("\t-c, --csv=/path/to/file.csv\n"); printf("\t\tCsv output file selection.\n"); printf( "\t\t\tValid value for /path/to/file.csv is any valid file name. " - "(default: stdout)\n\n"); + "(default: stdout)\n"); printf("\t-r, --routines=ROUTINES\n"); printf("\t\tRoutine selection.\n"); printf( "\t\t\tValid value for ROUTINES is one of more valid blas3 routines " - "delimited by a comma. (default: %s)\n\n", + "delimited by a comma. (default: %s)\n", DEFAULT_BLAS_ROUTINES); + + printf("\t-v, --verify=VERIFY\n"); + printf("\t\tVerification selection. (untimed)\n"); + printf( + "\t\t\tValid values for VERIFY are either 0 to skip verification or 1 to " + "verify before timing. 
" + "(default: %d)\n", + DEFAULT_VERIFY); } static void __blas3_perf_test_input_error(char **argv, char short_opt, @@ -211,42 +240,47 @@ int main(int argc, char **argv) { }; /* set default options */ - options.test = DEFAULT_TEST; - options.loop = DEFAULT_LOOP; - options.start.a.k = DEFAULT_K; - options.start.a.m = DEFAULT_MATRIX_START; - options.start.a.n = DEFAULT_MATRIX_START; - options.stop.a.k = DEFAULT_K; - options.stop.a.m = DEFAULT_MATRIX_STOP; - options.stop.a.n = DEFAULT_MATRIX_STOP; - options.start.b.k = DEFAULT_K; - options.start.b.m = DEFAULT_MATRIX_START; - options.start.b.n = DEFAULT_MATRIX_START; - options.stop.b.k = DEFAULT_K; - options.stop.b.m = DEFAULT_MATRIX_STOP; - options.stop.b.n = DEFAULT_MATRIX_STOP; - options.start.c.k = DEFAULT_K; - options.start.c.m = DEFAULT_MATRIX_START; - options.start.c.n = DEFAULT_MATRIX_START; - options.stop.c.k = DEFAULT_K; - options.stop.c.m = DEFAULT_MATRIX_STOP; - options.stop.c.n = DEFAULT_MATRIX_STOP; - options.step = DEFAULT_STEP; - options.warm_up_n = DEFAULT_WARM_UP_N; - options.n = DEFAULT_N; - options.out = DEFAULT_OUT; - options.blas_routines = std::string(DEFAULT_BLAS_ROUTINES); - options.blas_args.team_size = DEFAULT_TEAM_SIZE; - options.blas_args.vector_len = DEFAULT_VECTOR_LEN; + options.test = DEFAULT_TEST; + options.loop = DEFAULT_LOOP; + options.start.a.k = DEFAULT_K; + options.start.a.m = DEFAULT_MATRIX_START; + options.start.a.n = DEFAULT_MATRIX_START; + options.stop.a.k = DEFAULT_K; + options.stop.a.m = DEFAULT_MATRIX_STOP; + options.stop.a.n = DEFAULT_MATRIX_STOP; + options.start.b.k = DEFAULT_K; + options.start.b.m = DEFAULT_MATRIX_START; + options.start.b.n = DEFAULT_MATRIX_START; + options.stop.b.k = DEFAULT_K; + options.stop.b.m = DEFAULT_MATRIX_STOP; + options.stop.b.n = DEFAULT_MATRIX_STOP; + options.start.c.k = DEFAULT_K; + options.start.c.m = DEFAULT_MATRIX_START; + options.start.c.n = DEFAULT_MATRIX_START; + options.stop.c.k = DEFAULT_K; + options.stop.c.m = DEFAULT_MATRIX_STOP; + 
options.stop.c.n = DEFAULT_MATRIX_STOP; + options.step = DEFAULT_STEP; + options.warm_up_n = DEFAULT_WARM_UP_N; + options.n = DEFAULT_N; + options.out = DEFAULT_OUT; + options.blas_routines = std::string(DEFAULT_BLAS_ROUTINES); + options.blas_args.team_size = DEFAULT_TEAM_SIZE; + options.blas_args.vector_len = DEFAULT_VECTOR_LEN; + options.blas_args.use_auto = DEFAULT_USE_AUTO; + options.blas_args.batch_size_last_dim = DEFAULT_BATCH_SIZE_LAST_DIM; + options.verify = DEFAULT_VERIFY; options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS; options.blas_args.trmm.alpha = DEFAULT_TRMM_ALPHA; options.blas_args.gemm.gemm_args = DEFAULT_GEMM_ARGS; options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; + options.blas_args.gemm.beta = DEFAULT_GEMM_BETA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:", - long_options, &option_idx)) != -1) { + while ( + (ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:v:", + long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; case 't': @@ -269,14 +303,19 @@ int main(int argc, char **argv) { break; case 'g': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); - if (strlen(optarg) != 3) { + if (strlen(optarg) != 2) { __blas3_perf_test_input_error(argv, ret, optarg); } options.blas_args.gemm.gemm_args = optarg; break; case 'p': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); - options.blas_args.gemm.alpha = (default_scalar)atof(optarg); + double alpha, beta; + if (sscanf(optarg, "%lf,%lf", &alpha, &beta) != 2) + __blas3_perf_test_input_error(argv, ret, optarg); + + options.blas_args.gemm.alpha = static_cast(alpha); + options.blas_args.gemm.beta = static_cast(beta); break; case 'a': // printf("optarg=%s. 
%d\n", optarg, strncasecmp(optarg, "blas", 4)); @@ -361,8 +400,11 @@ int main(int argc, char **argv) { options.stop.a.k = options.stop.b.k = options.stop.c.k = atoi(optarg); break; + case 'd': options.blas_args.batch_size_last_dim = atoi(optarg); break; + case 'v': options.verify = atoi(optarg); break; case 'z': options.blas_args.team_size = atoi(optarg); break; case 'n': options.blas_args.vector_len = atoi(optarg); break; + case 'u': options.blas_args.use_auto = atoi(optarg); break; case 'c': out_file = optarg; options.out_file = std::string(out_file); diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 70f7664679..de2bbd9ce9 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -72,6 +72,62 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { #define DEFAULT_TRMM_ARGS "LUNU" #define DEFAULT_TRMM_ALPHA 1.0 +/** + * The KokkosBatched::SerialTrmm implementation performs dot products on + * non-zero elements of the triangular matrices. The flop calculation below + * assumes KokkosBatched::SerialTrmm is being used. Since the dot products + * do a multiply and add we can calculate the flops for any element in the last + * column of the LHS to be 2*columns_LHS, any element in the last-1 column of + * the LHS to be 2*(columns_LHS-1), and so on. We do this for every row of the + * LHS giving us this flop count: flops = columns_LHS * (columns_LHS + 1) flops + * = (flops / 2) * 2 flops = flops * rows_LHS + */ +static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, + int a_n) { + int flops; + + if (side == 'L' || side == 'l') { + flops = (b_m * (b_m + 1)) * b_n; + } else { + flops = (b_n * (b_n + 1)) * b_m; + } + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return flops; + + // Account for 6 additional flops when complex numbers are used. 
+ // Above we have counted 1 flop for each add and 1 flop for each multiply. + // For complex, we need to count 2 flops for each add and 6 flops for each + // multiply. + return flops * 4; +} + +// Flop count formula from lapack working note 41: +// http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline double __trmm_flop_count(char side, double b_m, double b_n, + double a_m, double a_n) { + double flops; + + if (side == 'L' || side == 'l') { + flops = b_m * b_m * b_n; + } else { + flops = b_n * b_n * b_m; + } + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return flops; + + // Account for 6 additional flops when complex numbers are used. + // Above we have counted 1 flop for each add and 1 flop for each multiply. + // For complex, we need to count 2 flops for each add and 6 flops for each + // multiply. + return flops * 4; +} + using view_type_3d = Kokkos::View; struct trmm_args { @@ -83,19 +139,54 @@ typedef struct trmm_args trmm_args_t; static std::string trmm_csv_header_str = "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n," - "iter,total_time(s),average_time(s)"; + "iter,total_time(s),average_time(s),FLOPS,GFLOP/" + "average_time(s),min_achieved_bandwidth(GB/s),max_achieved_bandwidth(GB/s)"; /*************************** Internal helper fns **************************/ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, double time_in_seconds) { + double flops = trmm_args.A.extent(0) * + __trmm_flop_count(trmm_args.side, trmm_args.B.extent(1), + trmm_args.B.extent(2), trmm_args.A.extent(1), + trmm_args.A.extent(2)); + double gflops = flops / 1e9; + double average_time = time_in_seconds / options.n; + double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) * + trmm_args.B.extent(2) * sizeof(default_scalar)) / + 1e9; + double min_memory_transactions, max_memory_transactions; + + // Assuming infinite cache size + // We have to read A and B into the cache once 
and then write + // B back out to main memory once. + min_memory_transactions = 3; + + // Assuming no register or real caching + // We have to go out to memory for every element we read from A and B as well + // as every element we write to B. We use the trmm flops from lapack note 41 + // and multiple by 3/2 to account for the write to B since this flop count is + // for one multiply and one add. + if (trmm_args.side == 'l' || trmm_args.side == 'L') + max_memory_transactions = trmm_args.B.extent(1) * trmm_args.B.extent(1) * + trmm_args.B.extent(2) * (3. / 2.); + else + max_memory_transactions = trmm_args.B.extent(2) * trmm_args.B.extent(2) * + trmm_args.B.extent(1) * (3. / 2.); + options.out[0] << test_e_str[options.test] << "," << options.blas_args.trmm.trmm_args << "," - << options.blas_args.trmm.alpha << "," - << loop_e_str[options.loop] << "," << trmm_args.A.extent(1) - << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(1) + << static_cast(options.blas_args.trmm.alpha) << "," + << loop_e_str[options.loop] << "," << trmm_args.A.extent(0) + << "x" << trmm_args.A.extent(1) << "x" << trmm_args.A.extent(2) + << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1) << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," - << time_in_seconds / options.n << std::endl; + << average_time << "," << flops << "," << gflops / average_time + << "," + << (gbytes_in_matrix * min_memory_transactions) / average_time + << "," + << (gbytes_in_matrix * max_memory_transactions) / average_time + << std::endl; } static void __print_trmm_perf_test_options(options_t options) { @@ -131,24 +222,30 @@ void __do_trmm_serial_blas(options_t options, trmm_args_t trmm_args) { STATUS; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + 
for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, - &trmm_args.diag, trmm_args.alpha, A, B); + KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, + &trmm_args.diag, trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } - Kokkos::fence(); timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, - &trmm_args.diag, trmm_args.alpha, A, B); + KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, + &trmm_args.diag, trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -167,21 +264,28 @@ void __do_trmm_serial_batched_template(options_t options, Kokkos::Timer timer; using tag = Algo::Trmm::Unblocked; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrmm::invoke(trmm_args.alpha, A, B); + 
SerialTrmm::invoke(trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrmm::invoke(trmm_args.alpha, A, B); + SerialTrmm::invoke(trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -306,6 +410,7 @@ struct parallel_blas_trmm { template void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { +// TODO: Note why this is disabled on CUDA and HIP #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; @@ -316,16 +421,24 @@ void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { STATUS; - Kokkos::parallel_for("parallelBlasWarmUpLoopTrmm", - Kokkos::RangePolicy(0, warm_up_n), - parallel_blas_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < warm_up_n; ++j) { + Kokkos::parallel_for( + "parallelBlasWarmUpLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBlasTimedLoopTrmm", - Kokkos::RangePolicy(0, n), - parallel_blas_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < n; ++j) { + Kokkos::parallel_for( + "parallelBlasTimedLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } 
__trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -368,16 +481,24 @@ void __do_trmm_parallel_batched_template(options_t options, STATUS; - Kokkos::parallel_for("parallelBatchedWarmUpLoopTrmm", - Kokkos::RangePolicy(0, warm_up_n), - parallel_batched_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < warm_up_n; ++j) { + Kokkos::parallel_for( + "parallelBatchedWarmUpLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBatchedTimedLoopTrmm", - Kokkos::RangePolicy(0, n), - parallel_batched_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < n; ++j) { + Kokkos::parallel_for( + "parallelBatchedTimedLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trmm_output_csv_row(options, trmm_args, timer.seconds()); return; @@ -498,19 +619,24 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { trmm_args.uplo = options.blas_args.trmm.trmm_args.c_str()[1]; trmm_args.trans = options.blas_args.trmm.trmm_args.c_str()[2]; trmm_args.diag = options.blas_args.trmm.trmm_args.c_str()[3]; - trmm_args.A = vta("trmm_args.A", options.n, dim.a.m, dim.a.n); - trmm_args.B = vtb("trmm_args.B", options.n, dim.b.m, dim.b.n); + trmm_args.A = vta("trmm_args.A", dim.a.k, dim.a.m, dim.a.n); + trmm_args.B = vtb("trmm_args.B", dim.b.k, dim.b.m, dim.b.n); trmm_args.alpha = options.blas_args.trmm.alpha; host_A = Kokkos::create_mirror_view(trmm_args.A); - Kokkos::fill_random(trmm_args.A, rand_pool, - Kokkos::rand, - scalar_type>::max()); - Kokkos::deep_copy(host_A, trmm_args.A); + { + Kokkos::View tmp( + "tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), + trmm_args.A.extent(2)); + Kokkos::fill_random(tmp, rand_pool, + Kokkos::rand, + double>::max()); + Kokkos::deep_copy(host_A, 
tmp); + } if (trmm_args.uplo == 'U' || trmm_args.uplo == 'u') { // Make A upper triangular - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 1; i < dim.a.m; i++) { for (int j = 0; j < i; j++) { @@ -522,7 +648,7 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { // Make A lower triangular // Kokkos::parallel_for("toLowerLoop", options.n, KOKKOS_LAMBDA (const int& // i) { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < dim.a.m - 1; i++) { for (int j = i + 1; j < dim.a.n; j++) { @@ -533,7 +659,7 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { } if (trmm_args.diag == 'U' || trmm_args.diag == 'u') { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < min_dim; i++) { A(i, i) = scalar_type(1); @@ -542,9 +668,15 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { } Kokkos::deep_copy(trmm_args.A, host_A); - Kokkos::fill_random(trmm_args.B, rand_pool, - Kokkos::rand, - scalar_type>::max()); + { + Kokkos::View tmp( + "tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), + trmm_args.B.extent(2)); + Kokkos::fill_random(tmp, rand_pool, + Kokkos::rand, + double>::max()); + Kokkos::deep_copy(trmm_args.B, tmp); + } return trmm_args; } @@ -566,8 +698,8 @@ void __do_loop_and_invoke(options_t options, for (cur_dims = options.start; cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n; - cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, - cur_dims.b.m *= options.step, cur_dims.b.n *= options.step) { + cur_dims.a.m += options.step, cur_dims.a.n += options.step, + cur_dims.b.m += options.step, 
cur_dims.b.n += options.step) { trmm_args = __do_setup( options, cur_dims); diff --git a/perf_test/sparse/KokkosSparse_block_pcg.cpp b/perf_test/sparse/KokkosSparse_block_pcg.cpp index 4f990f98ca..f4833cda17 100644 --- a/perf_test/sparse/KokkosSparse_block_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_block_pcg.cpp @@ -60,6 +60,69 @@ unsigned cg_iteration_limit = 10; +template +crsMat_t create_crs_matrix(char *mtx_bin_file) { + + using graph_t = typename crsMat_t::StaticCrsGraphType; + using row_map_view_t = typename graph_t::row_map_type::non_const_type; + using cols_view_t = typename graph_t::entries_type::non_const_type; + using values_view_t = typename crsMat_t::values_type::non_const_type; + using myExecSpace = typename crsMat_t::execution_space; + + crsMat_t crsmat; + + printf("matrix file: %s\n", mtx_bin_file); + + if(std::string(mtx_bin_file) == "auto") { + INDEX_TYPE num_rows = 11, num_cols = 11, nnz = 40; + crsmat = KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix(num_rows, num_cols, nnz, 3, 5); + printf("generating test matrix automatically\n"); + printf(" num rows: %d", num_rows); + printf(" num cols: %d", num_cols); + printf(" num non zeros: %d\n", nnz); + } else { + INDEX_TYPE nv = 0, ne = 0; + INDEX_TYPE *xadj, *adj; + SCALAR_TYPE *ew; + + KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); + + row_map_view_t rowmap_view ("rowmap_view", nv+1); + cols_view_t columns_view("colsmap_view", ne); + values_view_t values_view ("values_view", ne); + + if(KokkosKernels::Impl::kk_is_gpu_exec_space()) { + typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); + typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); + typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); + + for (INDEX_TYPE i = 0; i <= nv; ++i){ + hr(i) = xadj[i]; + } + for (INDEX_TYPE i = 0; i < ne; ++i){ + hc(i) = adj[i]; + hv(i) = ew[i]; + } + + Kokkos::deep_copy 
(rowmap_view , hr); + Kokkos::deep_copy (columns_view , hc); + Kokkos::deep_copy (values_view , hv); + } else { + KokkosKernels::Impl::copy_vector(ne, ew, values_view); + KokkosKernels::Impl::copy_vector(ne, adj, columns_view); + KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); + } + + graph_t static_graph (columns_view, rowmap_view); + crsmat = crsMat_t("CrsMatrix", nv, values_view, static_graph); + delete [] xadj; + delete [] adj; + delete [] ew; + } + + return crsmat; +} + template scalar_view_t create_x_vector(INDEX_TYPE nv, SCALAR_TYPE max_value = 1.0){ @@ -338,10 +401,11 @@ enum { CMD_USE_THREADS = 0 , CMD_USE_CORE_PER_NUMA , CMD_USE_CUDA , CMD_USE_OPENMP + , CMD_USE_SERIAL , CMD_USE_CUDA_DEV , CMD_BIN_MTX , CMD_ERROR - , CMD_COUNT }; + , CMD_COUNT}; int main (int argc, char ** argv){ @@ -355,7 +419,10 @@ int main (int argc, char ** argv){ for ( int i = 1 ; i < argc ; ++i ) { - if ( 0 == strcasecmp( argv[i] , "--threads" ) ) { + if ( 0 == strcasecmp( argv[i] , "--serial" ) ) { + cmdline[ CMD_USE_SERIAL ] = 1 ; + } + else if ( 0 == strcasecmp( argv[i] , "--threads" ) ) { kargs.num_threads = cmdline[ CMD_USE_THREADS ] = atoi( argv[++i] ); } else if ( 0 == strcasecmp( argv[i] , "--openmp" ) ) { @@ -380,14 +447,14 @@ int main (int argc, char ** argv){ std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ; std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; - return 0; + return 1; } } if (mtx_bin_file == NULL){ - std::cerr << "Provide a mtx binary file" << std::endl ; - std::cerr << "OPTIONS\n\t--threads [numThreads]\n\t--openmp [numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file]" << std::endl; - return 0; + std::cerr << "Provide a mtx binary file or specify auto-generation" << std::endl ; + std::cerr << "OPTIONS\n\t--serial\n\t--threads [numThreads]\n\t--openmp 
[numThreads]\n\t--cuda\n\t--cuda-dev[DeviceIndex]\n\t--mtx[binary_mtx_file|auto]" << std::endl; + return 1; } std::cout << "Running experiments with block size:" << block_size << std::endl; @@ -395,39 +462,37 @@ int main (int argc, char ** argv){ Kokkos::initialize(kargs); -#if defined( KOKKOS_ENABLE_THREADS ) +#if defined( KOKKOS_ENABLE_SERIAL ) - if ( cmdline[ CMD_USE_THREADS ] ) { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; + if ( cmdline[ CMD_USE_SERIAL ] ) { + using myExecSpace = Kokkos::Serial; + Kokkos::Serial::print_configuration(std::cout); + using crsMat_t = typename KokkosSparse::CrsMatrix; + crsMat_t crsmat = create_crs_matrix(mtx_bin_file); + INDEX_TYPE nv = crsmat.numRows(); - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - Kokkos::Threads::print_configuration(std::cout); - - typedef Kokkos::Threads myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; + using values_view_t = typename crsMat_t::values_type::non_const_type; + values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); + for (INDEX_TYPE i = nv; i < ((nv /block_size) + 1) * block_size; ++i){ + kok_x_original(i) = 0; + } + run_experiment(crsmat, kok_x_original, block_size); + } - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; +#endif - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); +#if defined( KOKKOS_ENABLE_THREADS ) - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); + if ( cmdline[ CMD_USE_THREADS ] ) { + using myExecSpace = Kokkos::Threads; + 
Kokkos::Threads::print_configuration(std::cout); - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - delete [] xadj; - delete [] adj; - delete [] ew; + using crsMat_t = typename KokkosSparse::CrsMatrix; + crsMat_t crsmat = create_crs_matrix(mtx_bin_file); + INDEX_TYPE nv = crsmat.numRows(); + using values_view_t = typename crsMat_t::values_type::non_const_type; values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); for (INDEX_TYPE i = nv; i < ((nv /block_size) + 1) * block_size; ++i){ kok_x_original(i) = 0; @@ -440,47 +505,19 @@ int main (int argc, char ** argv){ #if defined( KOKKOS_ENABLE_OPENMP ) if ( cmdline[ CMD_USE_OPENMP ] ) { - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; - - + using myExecSpace = Kokkos::OpenMP; Kokkos::OpenMP::print_configuration(std::cout); - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::OpenMP myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - KokkosKernels::Impl::copy_vector(ne, ew, values_view); - KokkosKernels::Impl::copy_vector(ne, adj, columns_view); - KokkosKernels::Impl::copy_vector(nv+1, xadj, rowmap_view); - - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - //crsMat_t crsmat("CrsMatrix", nv, nv, ne, ew, xadj, adj); - delete [] xadj; - delete [] adj; - delete [] ew; - + using crsMat_t = typename KokkosSparse::CrsMatrix; + crsMat_t crsmat = 
create_crs_matrix(mtx_bin_file); + INDEX_TYPE nv = crsmat.numRows(); + using values_view_t = typename crsMat_t::values_type::non_const_type; values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); for (INDEX_TYPE i = nv; i < ((nv /block_size) + 1) * block_size; ++i){ kok_x_original(i) = 0; } run_experiment(crsmat, kok_x_original, block_size); - } #endif @@ -488,57 +525,16 @@ int main (int argc, char ** argv){ #if defined( KOKKOS_ENABLE_CUDA ) if ( cmdline[ CMD_USE_CUDA ] ) { // Use the last device: - INDEX_TYPE nv = 0, ne = 0; - INDEX_TYPE *xadj, *adj; - SCALAR_TYPE *ew; + using myExecSpace = Kokkos::Cuda; Kokkos::Cuda::print_configuration(std::cout); - KokkosKernels::Impl::read_matrix (&nv, &ne, &xadj, &adj, &ew, mtx_bin_file); - - - typedef Kokkos::Cuda myExecSpace; - typedef typename KokkosSparse::CrsMatrix crsMat_t; - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename crsMat_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsMat_t::index_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - row_map_view_t rowmap_view("rowmap_view", nv+1); - cols_view_t columns_view("colsmap_view", ne); - values_view_t values_view("values_view", ne); - - - { - typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); - typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); - typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); - - for (INDEX_TYPE i = 0; i <= nv; ++i){ - hr(i) = xadj[i]; - } - - for (INDEX_TYPE i = 0; i < ne; ++i){ - hc(i) = adj[i]; - hv(i) = ew[i]; - } - Kokkos::deep_copy (rowmap_view , hr); - Kokkos::deep_copy (columns_view , hc); - Kokkos::deep_copy (values_view , hv); - - - } - graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nv, values_view, static_graph); - - delete [] xadj; - delete [] adj; - delete [] ew; + 
using crsMat_t = typename KokkosSparse::CrsMatrix; + crsMat_t crsmat = create_crs_matrix(mtx_bin_file); + INDEX_TYPE nv = crsmat.numRows(); + using values_view_t = typename crsMat_t::values_type::non_const_type; values_view_t kok_x_original = create_x_vector(((nv /block_size) + 1) * block_size, MAXVAL); run_experiment(crsmat, kok_x_original, block_size); - - } #endif diff --git a/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp b/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp index 285bd1038f..d769d3a4da 100644 --- a/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp +++ b/perf_test/sparse/spmv/OpenMPSmartStatic_SPMV.hpp @@ -138,7 +138,7 @@ void openmp_smart_static_matvec(AType A, XType x, YType y) { #pragma omp parallel { -#ifdef KOKKOS_COMPILER_INTEL +#if defined(KOKKOS_COMPILER_INTEL) && !defined(__clang__) __assume_aligned(x_ptr, 64); __assume_aligned(y_ptr, 64); #endif diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index c5f7148125..f41cd818bf 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -445,12 +445,12 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then MODULE_ENVIRONMENT="source /projects/sems/modulefiles/utils/sems-modules-init.sh" eval "$MODULE_ENVIRONMENT" - module load sems-cmake/3.12.2 - BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/" - CUDA9_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/6.1.0" - CUDA10_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/7.3.0" - CUDA11_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/9.2.0" - CLANG7_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-cuda/9.2" + module load sems-cmake/3.17.1 + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/" + CUDA9_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/6.1.0" + CUDA10_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/7.3.0" + CUDA11_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/9.2.0" + CLANG7_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-cuda/9.2" SKIP_HWLOC=True if [ -z 
"$ARCH_FLAG" ]; then @@ -502,16 +502,16 @@ elif [ "$MACHINE" = "white" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 - BASE_MODULE_LIST="cmake/3.12.3,/" - IBM_MODULE_LIST="cmake/3.12.3,/xl/,gcc/7.2.0" - CUDA_MODULE_LIST="cmake/3.12.3,/,gcc/7.2.0,ibm/xl/16.1.1" - CUDA10_MODULE_LIST="cmake/3.12.3,/,gcc/7.4.0,ibm/xl/16.1.1" + BASE_MODULE_LIST="cmake/3.19.3,/" + IBM_MODULE_LIST="cmake/3.19.3,/xl/,gcc/7.2.0" + CUDA_MODULE_LIST="cmake/3.19.3,/,gcc/7.2.0,ibm/xl/16.1.1" + CUDA10_MODULE_LIST="cmake/3.19.3,/,gcc/7.4.0,ibm/xl/16.1.1" - GCC72_MODULE_TPL_LIST="cmake/3.12.3,/,netlib/3.8.0/gcc/7.2.0" - GCC74_MODULE_TPL_LIST="cmake/3.12.3,/,openblas/0.3.4/gcc/7.4.0" - CUDA_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" - CUDA10_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.4.0,openblas/0.3.4/gcc/7.4.0" - IBM_MODULE_TPL_LIST="cmake/3.12.3,/xl/,gcc/7.2.0,netlib/3.8.0/ibm/xl/16.1.1" + GCC72_MODULE_TPL_LIST="cmake/3.19.3,/,netlib/3.8.0/gcc/7.2.0" + GCC74_MODULE_TPL_LIST="cmake/3.19.3,/,openblas/0.3.4/gcc/7.4.0" + CUDA_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" + CUDA10_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.4.0,openblas/0.3.4/gcc/7.4.0" + IBM_MODULE_TPL_LIST="cmake/3.19.3,/xl/,gcc/7.2.0,netlib/3.8.0/ibm/xl/16.1.1" # Don't do pthread on white. 
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" @@ -534,7 +534,8 @@ elif [ "$MACHINE" = "white" ]; then ) else # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "ibm/16.1.1 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" @@ -555,14 +556,14 @@ elif [ "$MACHINE" = "weaver" ]; then eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True - BASE_MODULE_LIST="cmake/3.12.3,/" - IBM_MODULE_LIST="cmake/3.12.3,/xl/,gcc/7.2.0" - CUDA_MODULE_LIST="cmake/3.12.3,/,gcc/7.2.0,ibm/xl/16.1.1" - CUDA10_MODULE_LIST="cmake/3.12.3,/,gcc/7.4.0,ibm/xl/16.1.1" + BASE_MODULE_LIST="cmake/3.19.3,/" + IBM_MODULE_LIST="cmake/3.19.3,/xl/,gcc/7.2.0" + CUDA_MODULE_LIST="cmake/3.19.3,/,gcc/7.2.0,ibm/xl/16.1.1" + CUDA10_MODULE_LIST="cmake/3.19.3,/,gcc/7.4.0,ibm/xl/16.1.1" - GCC72_MODULE_TPL_LIST="cmake/3.12.3,/,openblas/0.2.20/gcc/7.2.0" - CUDA_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" - CUDA10_MODULE_TPL_LIST="cmake/3.12.3,/,gcc/7.2.0,openblas/0.2.20/gcc/7.2.0" + GCC72_MODULE_TPL_LIST="cmake/3.19.3,/,openblas/0.2.20/gcc/7.2.0" + CUDA_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" + CUDA10_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.2.0,openblas/0.2.20/gcc/7.2.0" # Issues finding CUBLAS with cuda/10.1.243 module at configure # "Could NOT find TPLCUBLAS (missing: CUDA_CUBLAS_LIBRARIES)" # Once resolved add the compiler + modules below to the SPOT_CHECK_TPLS @@ -609,7 +610,7 @@ elif [ "$MACHINE" = "voltrino" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 - BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/19.05.5a,/,gcc/9.3.0" + 
BASE_MODULE_LIST="PrgEnv-intel,craype-mic-knl,cmake/3.16.2,slurm/20.11.4a,/,gcc/9.3.0" # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("intel/17.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" @@ -624,14 +625,13 @@ elif [ "$MACHINE" = "mayer" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=96 - BASE_MODULE_LIST="cmake/3.14.5,/" -# ARM_MODULE_LIST="cmake/3.12.2,/" + BASE_MODULE_LIST="cmake/3.17.1,/" ARMCLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Wsign-compare,-Wtype-limits,-Wuninitialized" # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("gnu7/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "arm/20.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $ARMCLANG_WARNING_FLAGS") + "arm/20.1 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $ARMCLANG_WARNING_FLAGS") if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=ARMV8_THUNDERX2" @@ -650,7 +650,7 @@ elif [ "$MACHINE" = "caraway" ]; then HIPCLANG_WARNING_FLAGS="" # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("rocm/3.8.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS") + COMPILERS=("rocm/3.10.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS") if [ -z "$ARCH_FLAG" ]; then ARCH_FLAG="--arch=VEGA900" @@ -661,10 +661,14 @@ elif [ "$MACHINE" = "blake" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 - module load cmake/3.12.3 + module load cmake/3.19.3 - BASE_MODULE_LIST="cmake/3.12.3,/" - BASE_MODULE_LIST_INTEL="cmake/3.12.3,/compilers/" + BASE_MODULE_LIST="cmake/3.19.3,/" + BASE_MODULE_LIST_INTEL="cmake/3.19.3,/compilers/" + BASE_MODULE_LIST_ONEAPI="cmake/3.19.3,/oneAPI/base-toolkit/" + ONEAPI_WARNING_FLAGS="" + + GCC72_MODULE_TPL_LIST="$BASE_MODULE_LIST,openblas/0.2.20/gcc/7.2.0" if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) @@ -672,14 +676,14 @@ elif [ "$MACHINE" = "blake" ]; then 
#"intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" COMPILERS=("intel/19.1.144 $BASE_MODULE_LIST_INTEL "OpenMP_Serial" icpc $INTEL_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST "Pthread_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) # TODO: Failing toolchains: #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL "OpenMP,Pthread" icpc $INTEL_WARNING_FLAGS" + "gcc/7.2.0 $GCC72_MODULE_TPL_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" ) else COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" @@ -687,12 +691,14 @@ elif [ "$MACHINE" = "blake" ]; then "intel/19.1.144 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.3.199 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/19.5.281 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/2021.1.1 $BASE_MODULE_LIST_ONEAPI $INTEL_BUILD_LIST icpx $ONEAPI_WARNING_FLAGS" "gcc/5.5.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.4.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/8.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) fi @@ -707,21 +713,21 @@ elif [ "$MACHINE" = 
"apollo" ]; then module load sems-git module load sems-tex - module load sems-cmake/3.12.2 + module load sems-cmake/3.17.1 module load sems-gdb module load binutils SKIP_HWLOC=True - BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/" - CUDA9_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/5.3.0" - CUDA10_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/5.3.0" - CUDA101_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/7.3.0" + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/" + CUDA9_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/5.3.0" + CUDA10_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/5.3.0" + CUDA101_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/7.3.0" - CLANG_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,cuda/9.0.69" - NVCC_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/5.3.0" -# HPX_MODULE_LIST="sems-env,sems-cmake/3.12.2,hpx/1.2.1,sems-gcc/6.1.0,binutils" -# HPX3_MODULE_LIST="sems-env,sems-cmake/3.12.2,compilers/hpx/1.3.0,sems-gcc/6.1.0,binutils" + CLANG_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,cuda/9.0.69" + NVCC_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/5.3.0" +# HPX_MODULE_LIST="sems-env,sems-cmake/3.17.1,hpx/1.2.1,sems-gcc/6.1.0,binutils" +# HPX3_MODULE_LIST="sems-env,sems-cmake/3.17.1,compilers/hpx/1.3.0,sems-gcc/6.1.0,binutils" BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP" BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread" @@ -761,19 +767,19 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then module load sems-git module load sems-tex - module load sems-cmake/3.12.2 + module load sems-cmake/3.17.1 module load sems-gdb SKIP_HWLOC=True - BASE_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/" - GCC91_MODULE_LIST="sems-env,sems-cmake/3.12.2,/" - NVCC_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/7.3.0" - NVCC_SEMSMODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/7.3.0" - NVCC11_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,sems-gcc/9.2.0" + BASE_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/" + 
GCC91_MODULE_LIST="sems-env,sems-cmake/3.17.1,/" + NVCC_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/7.3.0" + NVCC_SEMSMODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/7.3.0" + NVCC11_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,sems-gcc/9.2.0" - CLANG_MODULE_LIST="sems-env,sems-cmake/3.12.2,sems-/,sems-gcc/6.1.0" - CLANG8_MODULE_LIST="sems-env,sems-cmake/3.12.2,/,cuda/10.0" + CLANG_MODULE_LIST="sems-env,sems-cmake/3.17.1,sems-/,sems-gcc/6.1.0" + CLANG8_MODULE_LIST="sems-env,sems-cmake/3.17.1,/,cuda/10.0" BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Pthread" BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_OpenMP" @@ -1077,7 +1083,7 @@ setup_env() { if [[ "${SPOT_CHECK_TPLS}" = "True" ]]; then # Some machines will require explicitly setting include dirs and libs - if ([[ "$MACHINE" = white* ]] || [[ "$MACHINE" = weaver* ]]) && [[ "$mod" = openblas* ]]; then + if ([[ "$MACHINE" = white* ]] || [[ "$MACHINE" = weaver* ]] || [[ "$MACHINE" = blake* ]]) && [[ "$mod" = openblas* ]]; then BLAS_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" LAPACK_LIBRARY_DIRS="${OPENBLAS_ROOT}/lib" # BLAS_LIBRARIES="openblas" @@ -1104,8 +1110,8 @@ setup_env() { done if [ -e ${CM_ALL_SCRIPT_PATH}/update_lib.sh ]; then - echo "calling ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE" - source ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE + echo "calling ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE $compiler" + source ${CM_ALL_SCRIPT_PATH}/update_lib.sh $MACHINE $compiler fi return 0 diff --git a/scripts/docker/Dockerfile.hip b/scripts/docker/Dockerfile.hip new file mode 100644 index 0000000000..2db14b1009 --- /dev/null +++ b/scripts/docker/Dockerfile.hip @@ -0,0 +1,28 @@ +ARG BASE=rocm/dev-ubuntu-20.04:3.10 +FROM $BASE + +RUN apt-get update && apt-get install -y \ + git \ + wget \ + && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV PATH=/opt/rocm/bin:$PATH + +ARG CMAKE_VERSION=3.18.5 +ENV CMAKE_DIR=/opt/cmake +RUN CMAKE_KEY=2D2CEF1034921684 && \ + 
CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} && \ + CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh && \ + CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc && \ + wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} && \ + gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} && \ + gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} && \ + grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check && \ + mkdir -p ${CMAKE_DIR} && \ + sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} && \ + rm cmake* +ENV PATH=${CMAKE_DIR}/bin:$PATH diff --git a/scripts/update_lib.sh b/scripts/update_lib.sh index 822efa28b8..34ab5dd3c9 100755 --- a/scripts/update_lib.sh +++ b/scripts/update_lib.sh @@ -1,30 +1,53 @@ #!/bin/bash -if [ "$1" = blake ]; then - ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" - if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.0.128 ]]; then - module swap gcc/4.9.3 gcc/6.4.0 - module list - fi -fi -if [ "$1" = kokkos-dev ]; then +local machine_input="$1" +local compiler_input="$2" + +check_sems_intel() { ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" if [[ "${ICPCVER}" = 17.* ]]; then module swap sems-gcc/4.9.3 sems-gcc/6.4.0 module list fi -fi -if [ "$1" = kokkos-dev-2 ]; then - ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" - if [[ "${ICPCVER}" = 17.* ]]; then - module swap sems-gcc/4.9.3 sems-gcc/6.4.0 + if [[ "${ICPCVER}" = 19.* ]]; then + # Newer gcc needed for c++ standard beyond c++14 + module swap sems-gcc/6.1.0 sems-gcc/7.2.0 module list fi -fi -if [ "$1" = sems ]; then +} + +check_sems_clang() { + CLANGVER=$(clang --version | grep "clang version" | cut -d " " -f 3) + if [[ "${CLANGVER}" = 9.* ]] || [[ "${CLANGVER}" = 10.* ]]; then + # Newer gcc needed for c++ standard beyond c++14 + module swap sems-gcc/5.3.0 sems-gcc/6.4.0 + module list + fi +} + +check_compiler_modules() { + if 
[[ "$compiler_input" = clang/* ]]; then + echo " clang compiler - check supporting modules" + check_sems_clang + elif [[ "$compiler_input" = intel/* ]]; then + echo " intel compiler - check supporting modules" + check_sems_intel + fi +} + +if [ "$machine_input" = blake ]; then ICPCVER="$(icpc --version | grep icpc | cut -d ' ' -f 3)" - if [[ "${ICPCVER}" = 17.* ]]; then - module swap sems-gcc/4.8.4 sems-gcc/6.4.0 + if [[ "${ICPCVER}" = 17.* || "${ICPCVER}" = 18.0.128 ]]; then + module swap gcc/4.9.3 gcc/6.4.0 module list fi fi +if [ "$machine_input" = kokkos-dev ]; then + check_compiler_modules +fi +if [ "$machine_input" = kokkos-dev-2 ]; then + check_compiler_modules +fi +if [ "$machine_input" = sems ] || [ "$machine_input" = sogpu ]; then + check_compiler_modules +fi diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 22c17b5247..57b5394107 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -437,4 +437,5 @@ KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC MKL) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUBLAS) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC CUSPARSE) KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC METIS) +KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC ARMPL) # Not yet here KOKKOSKERNELS_LINK_TPL(kokkoskernels PUBLIC MAGMA) diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp index 83c483a3d6..f96ffc49c3 100644 --- a/src/Kokkos_ArithTraits.hpp +++ b/src/Kokkos_ArithTraits.hpp @@ -856,15 +856,19 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION float infinity() { return HUGE_VALF; } static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const float x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; - #endif +#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isinf; +#endif return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const float x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +#ifdef
KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; - #endif +#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isnan; +#endif return isnan (x); } static KOKKOS_FORCEINLINE_FUNCTION mag_type abs (const float x) { @@ -1030,18 +1034,52 @@ class ArithTraits > { return std::complex (ArithTraits::infinity (), ArithTraits::infinity ()); } - static bool isInf (const std::complex& x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST +#ifdef KOKKOS_ENABLE_SYCL + template + static bool isInf(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isinf; - #endif +#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isinf; +#endif return isinf (real (x)) || isinf (imag (x)); } - static bool isNan (const std::complex& x) { - #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + template <> + static bool isInf(const std::complex& x) { + Kokkos::abort("isInf not available for std::complex!\n"); + return true; + } +#else + static bool isInf(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isinf; +#endif + return isinf (real (x)) || isinf (imag (x)); + } +#endif +#ifdef KOKKOS_ENABLE_SYCL + template + static bool isNan(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; - #endif +#elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isnan; +#endif return isnan (real (x)) || isnan (imag (x)); } + template <> + static bool isNan(const std::complex& x) { + Kokkos::abort("isNan not available for std::complex!\n"); + return true; + } +#else + static bool isNan(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isnan; +#endif + return isnan (real (x)) || isnan (imag (x)); + } +#endif static mag_type abs (const std::complex& x) { return std::abs (x); } @@ -1213,12 +1251,16 @@ class ArithTraits { static KOKKOS_FORCEINLINE_FUNCTION bool isInf (const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
using std::isinf; + #elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isinf; #endif return isinf (x); } static KOKKOS_FORCEINLINE_FUNCTION bool isNan (const val_type x) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST using std::isnan; + #elif KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::isnan; #endif return isnan (x); } diff --git a/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp b/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp index 07ca8933cf..f46a278e8b 100644 --- a/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp +++ b/src/batched/KokkosBatched_Eigendecomposition_Serial_Internal.hpp @@ -57,6 +57,7 @@ namespace KokkosBatched { RealType * w, const int wlen) { /// until debugging is done, comment out the code /// testing happens only for TPLs on host. + static_assert(false, "Serial eigendecomposition on device and/or without LAPACK is not implemented yet"); // typedef RealType real_type; // typedef Kokkos::Details::ArithTraits ats; @@ -356,9 +357,12 @@ namespace KokkosBatched { RealType * UR, const int urs0, const int urs1, RealType * w, const int wlen) { #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) - if (as0 == 1 || as1 == 1) { + //if (as0 == 1 || as1 == 1) { /// column major or row major and it runs on host /// potentially it can run tpls internally + // NOTE BMK: If LAPACK not enabled, this will static_assert. + // If neither stride is unit, will runtime assert. + // Otherwise will succeed using LAPACK. 
host_invoke(m, A, as0, as1, er, ers, @@ -366,6 +370,7 @@ namespace KokkosBatched { UL, uls0, uls1, UR, urs0, urs1, w, wlen); + /* } else { /// arbitrary strides should be handled by native implementation device_invoke(m, @@ -375,7 +380,9 @@ namespace KokkosBatched { UL, uls0, uls1, UR, urs0, urs1, w, wlen); + throw std::runtime_error("Serial eigendecomposition without unit stride is not implemented yet."); } + */ #else /// device code runs device_invoke(m, diff --git a/src/batched/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp b/src/batched/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp index b63d847786..7c4026d1e9 100644 --- a/src/batched/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp +++ b/src/batched/KokkosBatched_Eigendecomposition_TeamVector_Internal.hpp @@ -76,6 +76,8 @@ namespace KokkosBatched { RealType * UL, const int uls0, const int uls1, RealType * UR, const int urs0, const int urs1, RealType * w, const int wlen) { + static_assert(false, "TeamVector eigendecomposition is not implemented yet."); + /* #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST) if (as0 == 1 || as1 == 1) { /// column major or row major and it runs on host @@ -100,6 +102,7 @@ namespace KokkosBatched { UL, uls0, uls1, UR, urs0, urs1, w, wlen); + throw std::runtime_error("TeamVector eigendecomposition is not implemented yet."); } #else /// device code runs @@ -111,6 +114,7 @@ namespace KokkosBatched { UR, urs0, urs1, w, wlen); #endif +*/ return 0; } }; diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 3253b6ce12..89dd200150 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -204,7 +204,8 @@ namespace KokkosBatched { std::is_same >::value || std::is_same >::value || std::is_same >::value || - std::is_same >::value, + std::is_same >::value || + std::is_same::value, "KokkosKernels:: Invalid SIMD<> type."
); using value_type = T; }; @@ -281,6 +282,16 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> ::type mb() { return 2; } +#endif +#if defined(KOKKOS_ENABLE_SYCL) + template + KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if< + std::is_same::value, + int>::type + mb() { + return 2; + } #endif template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> @@ -330,6 +341,16 @@ namespace KokkosBatched { template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> ::type mb() { return 1; } +#endif +#if defined(KOKKOS_ENABLE_SYCL) + template + KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if< + std::is_same::value, + int>::type + mb() { + return 1; + } #endif template KOKKOS_INLINE_FUNCTION static constexpr typename std::enable_if::value,int> diff --git a/src/batched/KokkosBatched_Vector_SIMD.hpp b/src/batched/KokkosBatched_Vector_SIMD.hpp index a950e5e41f..d7d3d58080 100644 --- a/src/batched/KokkosBatched_Vector_SIMD.hpp +++ b/src/batched/KokkosBatched_Vector_SIMD.hpp @@ -702,6 +702,9 @@ namespace KokkosBatched { enum : int { vector_length = 8 }; typedef __m512d data_type __attribute__ ((aligned(64))); + inline + static const char* label() { return "AVX512"; } + template friend class Vector; diff --git a/src/blas/impl/KokkosBlas1_nrm1_impl.hpp b/src/blas/impl/KokkosBlas1_nrm1_impl.hpp index 9e1393eb5a..296c424b3c 100644 --- a/src/blas/impl/KokkosBlas1_nrm1_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm1_impl.hpp @@ -52,10 +52,10 @@ namespace KokkosBlas { namespace Impl { // -// nrm1_squared +// nrm1 // -/// \brief 2-norm (squared) functor for single vectors. +/// \brief 1-norm functor for single vectors. 
/// /// \tparam RV 0-D output View /// \tparam XV 1-D input View @@ -63,12 +63,12 @@ namespace Impl { template struct V_Nrm1_Functor { - typedef typename XV::execution_space execution_space; - typedef SizeType size_type; - typedef typename XV::non_const_value_type xvalue_type; - typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; - typedef typename IPT::mag_type value_type; + typedef typename XV::execution_space execution_space; + typedef SizeType size_type; + typedef typename XV::non_const_value_type xvalue_type; + typedef Kokkos::ArithTraits XAT; + typedef typename XAT::mag_type value_type; + typedef Kokkos::ArithTraits MAT; typename XV::const_type m_x; @@ -94,12 +94,13 @@ struct V_Nrm1_Functor KOKKOS_INLINE_FUNCTION void operator() (const size_type& i, value_type& sum) const { - sum += IPT::norm (m_x(i)); + xvalue_type val = m_x(i); + sum += MAT::abs(XAT::real(val)) + MAT::abs(XAT::imag(val)); } KOKKOS_INLINE_FUNCTION void init (value_type& update) const { - update = AT::zero (); + update = MAT::zero (); } KOKKOS_INLINE_FUNCTION void @@ -117,7 +118,7 @@ struct V_Nrm1_Functor } }; -/// \brief Column-wise 2-norm functor for multivectors; works for +/// \brief Column-wise 1-norm functor for multivectors; works for /// any layout, but best performance with LayoutRight. 
/// /// \tparam RV 1-D output View @@ -126,12 +127,12 @@ struct V_Nrm1_Functor template struct MV_Nrm1_Right_FunctorVector { - typedef typename XMV::execution_space execution_space; - typedef SizeType size_type; - typedef typename XMV::non_const_value_type xvalue_type; - typedef Kokkos::Details::InnerProductSpaceTraits IPT; - typedef Kokkos::Details::ArithTraits AT; - typedef typename IPT::mag_type value_type[]; + typedef typename XMV::execution_space execution_space; + typedef SizeType size_type; + typedef typename XMV::non_const_value_type xvalue_type; + typedef Kokkos::ArithTraits XAT; + typedef Kokkos::ArithTraits MAT; + typedef typename XAT::mag_type value_type[]; size_type value_count; typename XMV::const_type m_x; @@ -166,7 +167,8 @@ struct MV_Nrm1_Right_FunctorVector #pragma vector always #endif for (size_type j = 0; j < numVecs; ++j) { - sum[j] += IPT::norm (m_x(i,j)); + xvalue_type val = m_x(i, j); + sum[j] += MAT::abs(XAT::real(val)) + MAT::abs(XAT::imag(val)); } } @@ -181,7 +183,7 @@ struct MV_Nrm1_Right_FunctorVector #pragma vector always #endif for (size_type j = 0; j < numVecs; ++j) { - update[j] = AT::zero (); + update[j] = MAT::zero (); } } diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp index 59bcf487fb..9e06bc45f2 100644 --- a/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -53,7 +53,15 @@ namespace KokkosKernels{ namespace Impl{ -enum ExecSpaceType{Exec_SERIAL, Exec_OMP, Exec_PTHREADS, Exec_QTHREADS, Exec_CUDA, Exec_HIP}; +enum ExecSpaceType { + Exec_SERIAL, + Exec_OMP, + Exec_PTHREADS, + Exec_QTHREADS, + Exec_CUDA, + Exec_HIP, + Exec_SYCL +}; template KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ ExecSpaceType exec_space = Exec_SERIAL; @@ -87,6 +95,12 @@ KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type(){ } #endif +#if defined(KOKKOS_ENABLE_SYCL) + if (std::is_same::value) { + exec_space = Exec_SYCL; + } 
+#endif + #if defined( KOKKOS_ENABLE_QTHREAD) if (std::is_same< Kokkos::Qthread, ExecutionSpace >::value){ exec_space = Exec_QTHREADS; @@ -115,6 +129,14 @@ constexpr KOKKOS_INLINE_FUNCTION bool kk_is_gpu_exec_space +constexpr KOKKOS_INLINE_FUNCTION bool +kk_is_gpu_exec_space() { + return true; +} +#endif + //Host function to determine free and total device memory. //Will throw if execution space doesn't support this. template diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp index 2e335d4f04..39ac62267c 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/common/KokkosKernels_Handle.hpp @@ -580,12 +580,24 @@ class KokkosKernelsHandle return gs2; } // ---------------------------------------- // - // Specify to use either Two-stage or Classical (i.e., inner Jacobi-Richardson or SpTrsv) + // Specify numer of outer sweeps for two-stage Gauss-Seidel + void set_gs_set_num_outer_sweeps (int num_outer_sweeps) { + auto gs2 = get_twostage_gs_handle(); + gs2->setNumOuterSweeps (num_outer_sweeps); + } + // ---------------------------------------- // + // Specify numer of inner sweeps for two-stage Gauss-Seidel void set_gs_set_num_inner_sweeps (int num_inner_sweeps) { auto gs2 = get_twostage_gs_handle(); gs2->setNumInnerSweeps (num_inner_sweeps); } // ---------------------------------------- // + // Specify damping factor of inner sweeps for two-stage Gauss-Seidel + void set_gs_set_inner_damp_factor (nnz_scalar_t damp_factor) { + auto gs2 = get_twostage_gs_handle(); + gs2->setInnerDampFactor (damp_factor); + } + // ---------------------------------------- // // Specify to use either Two-stage or Classical (i.e., inner Jacobi-Richardson or SpTrsv) void set_gs_twostage (bool two_stage, size_type nrows) { auto gs2 = get_twostage_gs_handle(); @@ -608,6 +620,13 @@ class KokkosKernelsHandle } } } + // ---------------------------------------- // + // Specify to use either Compact or Classical form of recurrence + void 
set_gs_twostage_compact_form (bool compact_form) { + auto gs2 = get_twostage_gs_handle(); + gs2->setCompactForm (compact_form); + } + void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster) { this->destroy_gs_handle(); diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/common/KokkosKernels_SparseUtils.hpp index 6979f15847..85763608ec 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/common/KokkosKernels_SparseUtils.hpp @@ -1042,9 +1042,11 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const val using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; bool useRadix = !kk_is_gpu_exec_space(); + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if(numRows == 0) + return; SortCrsMatrixFunctor funct(useRadix, rowmap, entries, values); - lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; if(useRadix) { Kokkos::parallel_for("sort_crs_matrix", Kokkos::RangePolicy(0, numRows), funct); @@ -1054,16 +1056,15 @@ void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, const val //Try to get teamsize to be largest power of 2 not greater than avg entries per row //TODO (probably important for performnce): add thread-level sort also, and use that //for small avg degree. But this works for now. 
- int teamSize = 1; - lno_t avgDeg = 0; - if(numRows) - avgDeg = (entries.extent(0) + numRows - 1) / numRows; - while(teamSize * 2 * 2 <= avgDeg) + lno_t idealTeamSize = 1; + lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; + while(idealTeamSize < avgDeg / 2) { - teamSize *= 2; + idealTeamSize *= 2; } - team_pol temp(numRows, teamSize); - teamSize = std::min(teamSize, temp.team_size_max(funct, Kokkos::ParallelForTag())); + team_pol temp(numRows, 1); + lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); + lno_t teamSize = std::min(idealTeamSize, maxTeamSize); Kokkos::parallel_for("sort_crs_matrix", team_pol(numRows, teamSize), funct); } } @@ -1090,9 +1091,11 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) using lno_t = typename entries_t::non_const_value_type; using team_pol = Kokkos::TeamPolicy; bool useRadix = !kk_is_gpu_exec_space(); + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if(numRows == 0) + return; SortCrsGraphFunctor funct(useRadix, rowmap, entries); - lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; if(useRadix) { Kokkos::parallel_for("sort_crs_graph", Kokkos::RangePolicy(0, numRows), funct); @@ -1103,16 +1106,15 @@ void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) //half the entries per row. 0.5 * #entries is bitonic's parallelism within a row. //TODO (probably important for performnce): add thread-level sort also, and use that //for small avg degree. But this works for now. 
- int teamSize = 1; - lno_t avgDeg = 0; - if(numRows) - avgDeg = (entries.extent(0) + numRows - 1) / numRows; - while(teamSize * 2 * 2 <= avgDeg) + lno_t idealTeamSize = 1; + lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; + while(idealTeamSize < avgDeg / 2) { - teamSize *= 2; + idealTeamSize *= 2; } - team_pol temp(numRows, teamSize); - teamSize = std::min(teamSize, temp.team_size_max(funct, Kokkos::ParallelForTag())); + team_pol temp(numRows, 1); + lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); + lno_t teamSize = std::min(idealTeamSize, maxTeamSize); Kokkos::parallel_for("sort_crs_graph", team_pol(numRows, teamSize), funct); } } diff --git a/src/graph/KokkosGraph_Distance1ColorHandle.hpp b/src/graph/KokkosGraph_Distance1ColorHandle.hpp index 077104ef9f..54a9b6db5b 100644 --- a/src/graph/KokkosGraph_Distance1ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance1ColorHandle.hpp @@ -240,7 +240,14 @@ class GraphColoringHandle { this->coloring_algorithm_type = COLORING_SERIAL; #ifdef VERBOSE - std:cout << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n"; + std::cout << "Serial Execution Space, Default Algorithm: COLORING_SERIAL\n"; +#endif + } + else if(KokkosKernels::Impl::kk_is_gpu_exec_space()) + { + this->coloring_algorithm_type = COLORING_EB; +#ifdef VERBOSE + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_EB\n"; #endif } else if(KokkosKernels::Impl::kk_is_gpu_exec_space()) diff --git a/src/graph/KokkosGraph_Distance2ColorHandle.hpp b/src/graph/KokkosGraph_Distance2ColorHandle.hpp index 39d66b744f..4dc7dd7fe7 100644 --- a/src/graph/KokkosGraph_Distance2ColorHandle.hpp +++ b/src/graph/KokkosGraph_Distance2ColorHandle.hpp @@ -209,14 +209,14 @@ class GraphColorDistance2Handle { this->coloring_algorithm_type = COLORING_D2_SERIAL; #ifdef VERBOSE - std:cout << "Serial Execution Space, Default Algorithm: COLORING_D2_SERIAL\n"; + std::cout << "Serial Execution Space, Default 
Algorithm: COLORING_D2_SERIAL\n"; #endif } else { this->coloring_algorithm_type = COLORING_D2_NB_BIT; #ifdef VERBOSE - std:cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_D2_NB_BIT\n"; + std::cout << ExecutionSpace::name() << " Execution Space, Default Algorithm: COLORING_D2_NB_BIT\n"; #endif } } diff --git a/src/graph/KokkosGraph_ExplicitCoarsening.hpp b/src/graph/KokkosGraph_ExplicitCoarsening.hpp index 212cb7c383..def892a167 100644 --- a/src/graph/KokkosGraph_ExplicitCoarsening.hpp +++ b/src/graph/KokkosGraph_ExplicitCoarsening.hpp @@ -80,6 +80,8 @@ void graph_explicit_coarsen( coarse_entries_t mergedEntries; KokkosKernels::Impl::sort_and_merge_graph (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + coarseRowmap = mergedRowmap; + coarseEntries = mergedEntries; } } @@ -109,6 +111,8 @@ void graph_explicit_coarsen_with_inverse_map( coarse_entries_t mergedEntries; KokkosKernels::Impl::sort_and_merge_graph (coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); + coarseRowmap = mergedRowmap; + coarseEntries = mergedEntries; } } diff --git a/src/graph/KokkosGraph_MIS2.hpp b/src/graph/KokkosGraph_MIS2.hpp index c578a97271..b3098870c5 100644 --- a/src/graph/KokkosGraph_MIS2.hpp +++ b/src/graph/KokkosGraph_MIS2.hpp @@ -94,6 +94,7 @@ graph_mis2_coarsen(const rowmap_t& rowmap, const colinds_t& colinds, typename co if(rowmap.extent(0) <= 1) { //there are no vertices to label + numClusters = 0; return labels_t(); } labels_t mis2 = graph_d2_mis(rowmap, colinds, algo); diff --git a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 110756a364..3adda031df 100644 --- a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -2058,32 +2058,39 @@ class GraphColor_VBD:public GraphColor newFrontierSize_; size_type maxColors_; color_view_type colors_; - - functorDeterministicColoring(const_lno_row_view_t rowPtr, - 
const_lno_nnz_view_t colInd, - nnz_lno_persistent_work_view_t dependency, - nnz_lno_temp_work_view_t frontier, - Kokkos::View frontierSize, - nnz_lno_temp_work_view_t newFrontier, - Kokkos::View newFrontierSize, - size_type maxColors, - color_view_type colors) - : xadj_(rowPtr), adj_(colInd), dependency_(dependency), frontier_(frontier), - frontierSize_(frontierSize), newFrontier_(newFrontier), newFrontierSize_(newFrontierSize), - maxColors_(maxColors), colors_(colors) {} + Kokkos::View bannedColors_; + + functorDeterministicColoring( + const_lno_row_view_t rowPtr, const_lno_nnz_view_t colInd, + nnz_lno_persistent_work_view_t dependency, + nnz_lno_temp_work_view_t frontier, + Kokkos::View frontierSize, + nnz_lno_temp_work_view_t newFrontier, + Kokkos::View newFrontierSize, + size_type maxColors, color_view_type colors) + : xadj_(rowPtr), + adj_(colInd), + dependency_(dependency), + frontier_(frontier), + frontierSize_(frontierSize), + newFrontier_(newFrontier), + newFrontierSize_(newFrontierSize), + maxColors_(maxColors), + colors_(colors), + bannedColors_("KokkosKernels::bannedColors", frontier.size(), + maxColors_) {} KOKKOS_INLINE_FUNCTION void operator() (const size_type frontierIdx) const { typedef typename std::remove_reference< decltype( newFrontierSize_() ) >::type atomic_incr_type; size_type frontierNode = frontier_(frontierIdx); - int* bannedColors = new int[maxColors_]; for(size_type colorIdx= 0; colorIdx < maxColors_; ++colorIdx) { - bannedColors[colorIdx] = 0; + bannedColors_(frontierIdx, colorIdx) = 0; } // Loop over neighbors, find banned colors, decrement dependency and update newFrontier for(size_type neigh = xadj_(frontierNode); neigh < xadj_(frontierNode + 1); ++neigh) { - bannedColors[colors_(adj_(neigh))] = 1; + bannedColors_(frontierIdx, colors_(adj_(neigh))) = 1; // We want to avoid the cost of atomic operations when not needed // so let's check that the node is not already colored, i.e. 
@@ -2100,12 +2107,11 @@ class GraphColor_VBD:public GraphColor struct GetUnifiedLayout { typedef typename std::conditional< ( (ViewType::rank == 1) && - (std::is_same::value) ) || + (!std::is_same::value) ) || ( (ViewType::rank == 0) ) ,Kokkos::LayoutLeft,typename ViewType::array_layout>::type array_layout; }; diff --git a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp index 217e6f4939..c0e58b19b3 100644 --- a/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_axpby_tpl_spec_avail.hpp @@ -72,18 +72,10 @@ struct axpby_tpl_spec_avail< \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -101,18 +93,10 @@ struct axpby_tpl_spec_avail< \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_AXPBY_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif 
#endif } diff --git a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index 135dcc6d1b..d3b0fabd71 100644 --- a/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -72,18 +72,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1,1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -101,18 +93,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1,1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif diff --git a/src/impl/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp index 8df32f62d8..182aba3115 100644 --- a/src/impl/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_iamax_tpl_spec_avail.hpp @@ -70,18 +70,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool 
{ value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS( unsigned long, double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS( unsigned long, float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS( unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_BLAS( unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -104,41 +96,23 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, double, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, Kokkos::complex,Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOS_ENABLE_CUDA_UVM) -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, double, 
Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, Kokkos::complex,Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned long, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSBLAS1_IAMAX_TPL_SPEC_AVAIL_CUBLAS( unsigned int, Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#endif #endif diff --git a/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp index 3ff2cf4703..5a44212e67 100644 --- a/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_nrm1_tpl_spec_avail.hpp @@ -70,18 +70,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -97,18 +89,10 @@ Kokkos::View, 
\ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRM1_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif diff --git a/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp index 3dd558ccd1..3facb0c245 100644 --- a/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_nrm2_tpl_spec_avail.hpp @@ -57,7 +57,6 @@ struct nrm2_tpl_spec_avail { namespace KokkosBlas { namespace Impl { - // Generic Host side BLAS (could be MKL or whatever) #ifdef KOKKOSKERNELS_ENABLE_TPL_BLAS // double @@ -70,18 +69,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -97,18 +88,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( double, 
Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRM2_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif diff --git a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp index 1ebf2e2f40..072abff904 100644 --- a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_avail.hpp @@ -70,45 +70,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif - -#endif - -// cuBLAS -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -// double -#define KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( SCALAR, LAYOUT, MEMSPACE ) \ -template \ -struct nrminf_tpl_spec_avail< \ -Kokkos::View::mag_type, LAYOUT, Kokkos::HostSpace, \ - Kokkos::MemoryTraits >, \ -Kokkos::View, \ - Kokkos::MemoryTraits >, \ -1> { enum : bool { value = true }; }; - -#if defined (KOKKOSKERNELS_INST_DOUBLE) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined 
(KOKKOSKERNELS_INST_FLOAT) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) -KOKKOSBLAS1_NRMINF_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif #endif diff --git a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp index 5f7a102e77..b91e81891a 100644 --- a/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_nrminf_tpl_spec_decl.hpp @@ -83,6 +83,7 @@ Kokkos::View, \ typedef Kokkos::View, \ Kokkos::MemoryTraits > XV; \ typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ \ static void nrminf (RV& R, const XV& X) \ { \ @@ -94,7 +95,7 @@ Kokkos::View, \ int N = numElems; \ int one = 1; \ int idx = HostBlas::iamax(N,X.data(),one)-1; \ - R() = X(idx); \ + R() = IPT::norm(X(idx)); \ } else { \ NrmInf::nrminf(R,X); \ } \ @@ -116,6 +117,7 @@ Kokkos::View, \ typedef Kokkos::View, \ Kokkos::MemoryTraits > XV; \ typedef typename XV::size_type size_type; \ + typedef Kokkos::Details::InnerProductSpaceTraits IPT; \ \ static void nrminf (RV& R, const XV& X) \ { \ @@ -127,7 +129,7 @@ Kokkos::View, \ int N = numElems; \ int one = 1; \ int idx = HostBlas::iamax(N,X.data(),one)-1; \ - R() = X(idx); \ + R() = IPT::norm(X(idx)); \ } else { \ NrmInf::nrminf(R,X); \ } \ @@ -220,176 +222,4 @@ KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_BLAS( Kokkos::LayoutLeft, Kokkos::HostSpace, f #endif -// cuBLAS -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -#include - -namespace KokkosBlas { -namespace Impl { - -#define KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ -template \ -struct NrmInf< \ -Kokkos::View >, \ -Kokkos::View, \ - Kokkos::MemoryTraits 
>, \ -1,true, ETI_SPEC_AVAIL > { \ - \ - typedef Kokkos::View > RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrminf (RV& R, const XV& X) \ - { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,double]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { Kokkos::deep_copy (R, 0.0); return; } \ - if (numElems < static_cast (INT_MAX)) { \ - nrminf_print_specialization(); \ - const int N = static_cast (numElems); \ - constexpr int one = 1; \ - int idx; \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasIdamax(s.handle, N, X.data(), one, &idx); \ - Kokkos::deep_copy(R, subview(X,idx-1)); \ - } else { \ - NrmInf::nrminf(R,X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ -}; - -#define KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ -template \ -struct NrmInf< \ -Kokkos::View >, \ -Kokkos::View, \ - Kokkos::MemoryTraits >, \ -1,true, ETI_SPEC_AVAIL > { \ - \ - typedef Kokkos::View > RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > XV; \ - typedef typename XV::size_type size_type; \ - \ - static void nrminf (RV& R, const XV& X) \ - { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { Kokkos::deep_copy (R, 0.0f);; return; } \ - if (numElems < static_cast (INT_MAX)) { \ - nrminf_print_specialization(); \ - const int N = static_cast (numElems); \ - constexpr int one = 1; \ - int idx; \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasIsamax(s.handle, N, X.data(), one, &idx); \ - Kokkos::deep_copy(R, subview(X,idx-1)); \ - } else { \ - NrmInf::nrminf(R,X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ -}; - -#define KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ -template \ -struct 
NrmInf< \ -Kokkos::View >, \ -Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ -1,true, ETI_SPEC_AVAIL > { \ - \ - typedef Kokkos::View > RV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits> IPT; \ - \ - static void nrminf (RV& R, const XV& X) \ - { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { Kokkos::deep_copy (R, 0.0); return; } \ - if (numElems < static_cast (INT_MAX)) { \ - nrminf_print_specialization(); \ - const int N = static_cast (numElems); \ - constexpr int one = 1; \ - int idx; \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasIzamax(s.handle, N, reinterpret_cast(X.data()), one, &idx); \ - Kokkos::complex R_cplx_val {0.0, 0.0}; \ - Kokkos::View, LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits > R_cplx (&R_cplx_val); \ - Kokkos::deep_copy(R_cplx, subview(X,idx-1)); \ - R() = IPT::norm(R_cplx()); \ - } else { \ - NrmInf::nrminf(R,X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ -}; - -#define KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_CUBLAS( LAYOUT, MEMSPACE, ETI_SPEC_AVAIL ) \ -template \ -struct NrmInf< \ -Kokkos::View >, \ -Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits >, \ -1,true, ETI_SPEC_AVAIL > { \ - \ - typedef Kokkos::View > RV; \ - typedef Kokkos::View*, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits > XV; \ - typedef typename XV::size_type size_type; \ - typedef Kokkos::Details::InnerProductSpaceTraits> IPT; \ - \ - static void nrminf (RV& R, const XV& X) \ - { \ - Kokkos::Profiling::pushRegion("KokkosBlas::nrminf[TPL_CUBLAS,complex]"); \ - const size_type numElems = X.extent(0); \ - if (numElems == 0) { Kokkos::deep_copy (R, 0.0f); return; } \ - if (numElems < static_cast (INT_MAX)) { \ - nrminf_print_specialization(); \ - 
const int N = static_cast (numElems); \ - constexpr int one = 1; \ - int idx; \ - KokkosBlas::Impl::CudaBlasSingleton & s = KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasIcamax(s.handle, N, reinterpret_cast(X.data()), one, &idx); \ - Kokkos::complex R_cplx_val {0.0f, 0.0f}; \ - Kokkos::View, LAYOUT, Kokkos::HostSpace, Kokkos::MemoryTraits > R_cplx (&R_cplx_val); \ - Kokkos::deep_copy(R_cplx, subview(X,idx-1)); \ - R() = IPT::norm(R_cplx()); \ - } else { \ - NrmInf::nrminf(R,X); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ -}; - -KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS1_DNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS1_SNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) -KOKKOSBLAS1_CNRMINF_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) - -} -} - -#endif - #endif diff --git a/src/impl/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp index 2b92355dd9..114923cca7 100644 --- a/src/impl/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas1_scal_tpl_spec_avail.hpp @@ -71,18 +71,10 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) 
KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif #endif @@ -99,33 +91,15 @@ Kokkos::View, \ Kokkos::MemoryTraits >, \ 1> { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOS_ENABLE_CUDA_UVM) -#if defined (KOKKOSKERNELS_INST_DOUBLE) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) KOKKOSBLAS1_SCAL_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#endif #endif diff --git a/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp index 579d9b81a5..d866702f4f 100644 --- a/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas1_scal_tpl_spec_decl.hpp @@ -376,7 +376,6 @@ KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, f 
KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, true) KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaSpace, false) -#if defined (KOKKOS_ENABLE_CUDA_UVM) KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) KOKKOSBLAS1_DSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) @@ -388,7 +387,6 @@ KOKKOSBLAS1_ZSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, true) KOKKOSBLAS1_CSCAL_TPL_SPEC_DECL_CUBLAS( Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, false) -#endif } } diff --git a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp index c1fd67f9ea..5c6d1734dc 100644 --- a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_avail.hpp @@ -67,39 +67,15 @@ struct gemv_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined 
(KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif #endif @@ -117,39 +93,15 @@ struct gemv_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if 
defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS2_GEMV_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif #endif } diff --git a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp index 49f6fe743c..3b21c0e8a7 100644 --- a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_avail.hpp @@ -67,39 +67,15 @@ struct gemm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, 
Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif #endif @@ -117,47 +93,23 @@ struct gemm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, 
Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && 
defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_GEMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif #endif } diff --git a/src/impl/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp index bce7cb5f5d..03e2badcc1 100644 --- a/src/impl/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas3_trmm_tpl_spec_avail.hpp @@ -66,39 +66,15 @@ struct trmm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined 
(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif #endif // KOKKOSKERNELS_ENABLE_TPL_BLAS @@ -114,47 +90,23 @@ struct trmm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined 
(KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRMM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif #endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS } // namespace Impl diff --git a/src/impl/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp index 93808e3eb0..29a04fb715 100644 --- a/src/impl/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas3_trsm_tpl_spec_avail.hpp @@ -66,39 +66,15 @@ struct trsm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined 
(KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::HostSpace) -#endif #endif @@ -114,47 +90,23 @@ struct trsm_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) 
KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( double, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( float, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined 
(KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS3_TRSM_TPL_SPEC_AVAIL_CUBLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif #endif } diff --git a/src/impl/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp index 917a55fec4..e25a9aa3f1 100644 --- a/src/impl/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas_gesv_tpl_spec_avail.hpp @@ -66,22 +66,10 @@ struct gesv_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) -#endif /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) @@ -114,22 +102,10 @@ struct gesv_tpl_spec_avail< \ Kokkos::MemoryTraits > \ > { enum : bool { value = true }; }; -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if 
defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_GESV_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif /* #if defined (KOKKOSKERNELS_INST_DOUBLE) \ diff --git a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp index fa651f531f..4b602bd765 100644 --- a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp @@ -78,55 +78,31 @@ KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) #define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( SCALAR , LAYOUTA, MEMSPACE ) #endif // KOKKOSKERNELS_ENABLE_TPL_MAGMA -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, 
Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_DOUBLE) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( double, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( double, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_FLOAT) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( float, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( float, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_BLAS( Kokkos::complex, Kokkos::LayoutRight, Kokkos::HostSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL_MAGMA( Kokkos::complex, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif } // namespace Impl } // namespace KokkosBlas diff 
--git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index abeab8c214..a6749be8c8 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -71,117 +71,22 @@ struct spmv_tpl_spec_avail, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) 
KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT) \ - && defined (KOKKOSKERNELS_INST_OFFSET_INT) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif //CUDA_VERSION by itself cannot determine whether the generic cuSPARSE API is available: //cuSPARSE version 10.1.105 does not have the generic API, but it comes with the same CUDA_VERSION (10010) as 10.1.243 which does. 
@@ -190,122 +95,52 @@ struct spmv_tpl_spec_avail, int64_t, size_t, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTLEFT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) \ - && defined 
(KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif - -#if defined (KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) \ - && defined (KOKKOSKERNELS_INST_LAYOUTRIGHT) \ - && defined (KOKKOSKERNELS_INST_ORDINAL_INT64_T) \ - && defined (KOKKOSKERNELS_INST_OFFSET_SIZE_T) KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::CudaUVMSpace) -#endif #endif // CUSPARSE >= 10.3 (nested, implies >= 9.0) #endif // CUDA/CUSPARSE >= 9.0? #endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ +template <> \ +struct spmv_tpl_spec_avail, Kokkos::MemoryTraits, const int, \ + const SCALAR*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + SCALAR*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits > { \ + enum : bool { value = true }; \ +}; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(float, Kokkos::Serial) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(double, Kokkos::Serial) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::Serial) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(float, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(double, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) +#endif + +#endif + // Specialization struct which defines whether a specialization exists template + +namespace KokkosSparse +{ +namespace Impl +{ + +#if (__INTEL_MKL__ > 2017) + //MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() 
+ + inline void mkl_safe_call(int errcode) + { + if(errcode != SPARSE_STATUS_SUCCESS) + throw std::runtime_error("MKL returned non-success error code"); + } + + inline sparse_operation_t mode_kk_to_mkl(char mode_kk) + { + switch(toupper(mode_kk)) + { + case 'N': + return SPARSE_OPERATION_NON_TRANSPOSE; + case 'T': + return SPARSE_OPERATION_TRANSPOSE; + case 'H': + return SPARSE_OPERATION_CONJUGATE_TRANSPOSE; + default:; + } + throw std::invalid_argument("Invalid mode for MKL (should be one of N, T, H)"); + } + + inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, + int m, int n, const int* Arowptrs, const int* Aentries, const float* Avalues, + const float* x, float* y) + { + sparse_matrix_t A_mkl; + matrix_descr A_descr; + A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; + A_descr.mode = SPARSE_FILL_MODE_FULL; + A_descr.diag = SPARSE_DIAG_NON_UNIT; + mkl_safe_call(mkl_sparse_s_create_csr(&A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); + mkl_safe_call(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + } + + inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, + int m, int n, const int* Arowptrs, const int* Aentries, const double* Avalues, + const double* x, double* y) + { + sparse_matrix_t A_mkl; + matrix_descr A_descr; + A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; + A_descr.mode = SPARSE_FILL_MODE_FULL; + A_descr.diag = SPARSE_DIAG_NON_UNIT; + mkl_safe_call(mkl_sparse_d_create_csr(&A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); + mkl_safe_call(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + } + + inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, Kokkos::complex beta, + int m, int n, const int* Arowptrs, const int* Aentries, const Kokkos::complex* Avalues, + const Kokkos::complex* x, Kokkos::complex* y) + { + sparse_matrix_t A_mkl; 
+ matrix_descr A_descr; + A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; + A_descr.mode = SPARSE_FILL_MODE_FULL; + A_descr.diag = SPARSE_DIAG_NON_UNIT; + mkl_safe_call(mkl_sparse_c_create_csr(&A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex8*) Avalues)); + MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); + MKL_Complex8& beta_mkl = reinterpret_cast(beta); + mkl_safe_call(mkl_sparse_c_mv( + op, alpha_mkl, A_mkl, A_descr, + reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); + } + + inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, Kokkos::complex beta, + int m, int n, const int* Arowptrs, const int* Aentries, const Kokkos::complex* Avalues, + const Kokkos::complex* x, Kokkos::complex* y) + { + sparse_matrix_t A_mkl; + matrix_descr A_descr; + A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; + A_descr.mode = SPARSE_FILL_MODE_FULL; + A_descr.diag = SPARSE_DIAG_NON_UNIT; + mkl_safe_call(mkl_sparse_z_create_csr(&A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex16*) Avalues)); + MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); + MKL_Complex16& beta_mkl = reinterpret_cast(beta); + mkl_safe_call(mkl_sparse_z_mv( + op, alpha_mkl, A_mkl, A_descr, + reinterpret_cast(x), beta_mkl, reinterpret_cast(y))); + } + +#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ + template<> \ + struct SPMV, Kokkos::MemoryTraits, int const, \ + SCALAR const*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + SCALAR*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + true, COMPILE_LIBRARY> { \ + \ + using device_type = Kokkos::Device; \ + using AMatrix = CrsMatrix, int const>; \ + using XVector = Kokkos::View>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + static void spmv 
(const Controls&, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, \ + const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_MKL," + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), A.numCols(), \ + A.graph.row_map.data(), A.graph.entries.data(), A.values.data(), x.data(), y.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; +#endif + +#if (__INTEL_MKL__ == 2017) + //MKL 2017: use old interface: mkl_?csrmv + inline char mode_kk_to_mkl(char mode_kk) + { + switch(toupper(mode_kk)) + { + case 'N': + return 'N'; + case 'T': + return 'T'; + case 'H': + return 'C'; + default:; + } + throw std::invalid_argument("Invalid mode for MKL (should be one of N, T, H)"); + } + + + //void mkl_scsrmv(const char *transa, const MKL_INT *m, const MKL_INT *k, const float *alpha, const char *matdescra, const float *val, const MKL_INT *indx, const MKL_INT *pn trb, const MKL_INT *pntre, const float *x, const float *beta, float *y); + inline void spmv_mkl(char mode, float alpha, float beta, int m, int n, const int* Arowptrs, const int* Aentries, const float* Avalues, const float* x, float* y) + { + mkl_scsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); + } + + inline void spmv_mkl(char mode, double alpha, double beta, int m, int n, const int* Arowptrs, const int* Aentries, const double* Avalues, const double* x, double* y) + { + mkl_dcsrmv(&mode, &m, &n, &alpha, "G**C", Avalues, Aentries, Arowptrs, Arowptrs + 1, x, &beta, y); + } + + inline void spmv_mkl(char mode, Kokkos::complex alpha, Kokkos::complex beta, int m, int n, const int* Arowptrs, const int* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) + { + const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); + const MKL_Complex8* beta_mkl = 
reinterpret_cast(&beta); + const MKL_Complex8* Avalues_mkl = reinterpret_cast(Avalues); + const MKL_Complex8* x_mkl = reinterpret_cast(x); + MKL_Complex8* y_mkl = reinterpret_cast(y); + mkl_ccsrmv(&mode, &m, &n, alpha_mkl, "G**C", Avalues_mkl, Aentries, Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); + } + + inline void spmv_mkl(char mode, Kokkos::complex alpha, Kokkos::complex beta, int m, int n, const int* Arowptrs, const int* Aentries, const Kokkos::complex* Avalues, const Kokkos::complex* x, Kokkos::complex* y) + { + const MKL_Complex16* alpha_mkl = reinterpret_cast(&alpha); + const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); + const MKL_Complex16* Avalues_mkl = reinterpret_cast(Avalues); + const MKL_Complex16* x_mkl = reinterpret_cast(x); + MKL_Complex16* y_mkl = reinterpret_cast(y); + mkl_zcsrmv(&mode, &m, &n, alpha_mkl, "G**C", Avalues_mkl, Aentries, Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); + } + +#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ + template<> \ + struct SPMV, Kokkos::MemoryTraits, int const, \ + SCALAR const*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + SCALAR*, Kokkos::LayoutLeft, Kokkos::Device, Kokkos::MemoryTraits, \ + true, COMPILE_LIBRARY> { \ + \ + using device_type = Kokkos::Device; \ + using AMatrix = CrsMatrix, int const>; \ + using XVector = Kokkos::View>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + static void spmv (const Controls&, \ + const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, \ + const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_MKL," + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), A.numCols(), \ + A.graph.row_map.data(), A.graph.entries.data(), 
A.values.data(), x.data(), y.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; +#endif + +#ifdef KOKKOS_ENABLE_SERIAL + KOKKOSSPARSE_SPMV_MKL(float, Kokkos::Serial, true) + KOKKOSSPARSE_SPMV_MKL(double, Kokkos::Serial, true) + KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, true) + KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, true) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP + KOKKOSSPARSE_SPMV_MKL(float, Kokkos::OpenMP, true) + KOKKOSSPARSE_SPMV_MKL(double, Kokkos::OpenMP, true) + KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, true) + KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, true) +#endif + +#undef KOKKOSSPARSE_SPMV_MKL +} +} +#endif + #endif // KOKKOSPARSE_SPMV_TPL_SPEC_DECL_HPP_ diff --git a/src/sparse/KokkosSparse_CrsMatrix.hpp b/src/sparse/KokkosSparse_CrsMatrix.hpp index d866a63601..d734d9ac3a 100644 --- a/src/sparse/KokkosSparse_CrsMatrix.hpp +++ b/src/sparse/KokkosSparse_CrsMatrix.hpp @@ -411,7 +411,7 @@ class CrsMatrix { typedef SizeType size_type; //! Type of a host-memory mirror of the sparse matrix. - typedef CrsMatrix HostMirror; + typedef CrsMatrix HostMirror; //! Type of the graph structure of the sparse matrix. typedef Kokkos::StaticCrsGraph StaticCrsGraphType; //! Type of the graph structure of the sparse matrix - consistent with Kokkos. @@ -473,13 +473,13 @@ class CrsMatrix { {} //! Copy constructor (shallow copy). - template + template KOKKOS_INLINE_FUNCTION - CrsMatrix (const CrsMatrix & B) : + CrsMatrix (const CrsMatrix & B) : graph (B.graph.entries, B.graph.row_map), values (B.values), dev_config (B.dev_config), @@ -494,14 +494,36 @@ class CrsMatrix { //as the constructor of StaticCrsGraph does not allow copy from non const version. } + //! 
Deep copy constructor (can cross spaces) + template + CrsMatrix (const std::string&, + const CrsMatrix& mat_) + { + typename row_map_type::non_const_type rowmap(Kokkos::ViewAllocateWithoutInitializing("rowmap"), mat_.graph.row_map.extent(0)); + index_type cols(Kokkos::ViewAllocateWithoutInitializing("cols"), mat_.nnz()); + values = values_type(Kokkos::ViewAllocateWithoutInitializing("values"), mat_.nnz()); + Kokkos::deep_copy(rowmap, mat_.graph.row_map); + Kokkos::deep_copy(cols, mat_.graph.entries); + Kokkos::deep_copy(values, mat_.values); + + numCols_ = mat_.numCols(); + graph = StaticCrsGraphType(cols, rowmap); + +#ifdef KOKKOS_USE_CUSPARSE + cusparseCreate (&cusparse_handle); + cusparseCreateMatDescr (&cusparse_descr); +#endif // KOKKOS_USE_CUSPARSE + } + /// \brief Construct with a graph that will be shared. /// /// Allocate the values array for subsquent fill. - CrsMatrix (const std::string& arg_label, - const staticcrsgraph_type& arg_graph) : - graph (arg_graph), - values (arg_label, arg_graph.entries.extent(0)), - numCols_ (maximum_entry (arg_graph) + 1) + template + CrsMatrix (const std::string& label, + const Kokkos::StaticCrsGraph& graph_) : + graph (graph_.entries, graph_.row_map), + values (label, graph_.entries.extent(0)), + numCols_ (maximum_entry (graph_) + 1) {} /// \brief Constructor that copies raw arrays of host data in @@ -609,11 +631,12 @@ class CrsMatrix { /// \param rows [in/out] The row map (containing the offsets to the /// data in each row). /// \param cols [in/out] The column indices. 
- CrsMatrix (const std::string& /* label */, + template + CrsMatrix (const std::string&, const OrdinalType& ncols, const values_type& vals, - const staticcrsgraph_type& graph_) : - graph (graph_), + const Kokkos::StaticCrsGraph& graph_) : + graph (graph_.entries, graph_.row_map), values (vals), numCols_ (ncols) { @@ -888,7 +911,6 @@ ctor_impl (const std::string &label, row_lengths[i] = rows[i + 1] - rows[i]; } - str = label; graph = Kokkos::create_staticcrsgraph (str.append (".graph"), row_lengths); typename values_type::HostMirror h_values = Kokkos::create_mirror_view (values); typename index_type::HostMirror h_entries = Kokkos::create_mirror_view (graph.entries); diff --git a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp index fd4a9b58d9..9176809115 100644 --- a/src/sparse/KokkosSparse_gauss_seidel_handle.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel_handle.hpp @@ -573,8 +573,13 @@ namespace KokkosSparse{ nrhs (1), direction (GS_SYMMETRIC), two_stage (true), - num_inner_sweeps (1) - {} + compact_form (false), + num_inner_sweeps (1), + num_outer_sweeps (1) + { + const scalar_t one (1.0); + inner_omega = one; + } // Sweep direction void setSweepDirection (GSDirection direction_) { @@ -592,6 +597,22 @@ namespace KokkosSparse{ return this->two_stage; } + // specify whether to use compact form of recurrence + void setCompactForm (bool compact_form_) { + this->compact_form = compact_form_; + } + bool isCompactForm () { + return this->compact_form; + } + + // Number of outer sweeps + void setNumOuterSweeps (int num_outer_sweeps_) { + this->num_outer_sweeps = num_outer_sweeps_; + } + int getNumOuterSweeps () { + return this->num_outer_sweeps; + } + // Number of inner sweeps void setNumInnerSweeps (int num_inner_sweeps_) { this->num_inner_sweeps = num_inner_sweeps_; @@ -600,27 +621,57 @@ namespace KokkosSparse{ return this->num_inner_sweeps; } - // workspaces + // Inner damping factor + void setInnerDampFactor (scalar_t 
inner_omega_) { + this->inner_omega = inner_omega_; + } + scalar_t getInnerDampFactor () { + return this->inner_omega; + } + + // Workspaces + // > diagonal (inverse) void setD (values_view_t D_) { this->D = D_; } values_view_t getD () { return this->D; } - + // > Lower part of diagonal block void setL (crsmat_t L) { this->crsmatL = L; } crsmat_t getL () { return this->crsmatL; } - + // > Upper part of diagonal block void setU (crsmat_t U) { this->crsmatU = U; } crsmat_t getU () { return this->crsmatU; } + // > Complement of U + void setLa (crsmat_t La) { + this->crsmatLa = La; + } + crsmat_t getLa () { + return this->crsmatLa; + } + // > Complement of L + void setUa (crsmat_t Ua) { + this->crsmatUa = Ua; + } + crsmat_t getUa () { + return this->crsmatUa; + } + // > diagonal (not-inverse) + void setDa (values_view_t Da_) { + this->Da = Da_; + } + values_view_t getDa () { + return this->Da; + } void initVectors (int nrows_, int nrhs_) { if (this->nrows != nrows_ || this->nrhs != nrhs_) { @@ -650,6 +701,11 @@ namespace KokkosSparse{ values_view_t D; crsmat_t crsmatL; crsmat_t crsmatU; + // > complements for compact form of recurrence + // where La = A - U and Ua = A - L + values_view_t Da; + crsmat_t crsmatLa; + crsmat_t crsmatUa; // > residual vector for outer GS, Rk = B-A*Xk vector_view_t localR; @@ -661,7 +717,10 @@ namespace KokkosSparse{ // solver parameters GSDirection direction; bool two_stage; + bool compact_form; int num_inner_sweeps; + int num_outer_sweeps; + scalar_t inner_omega; }; // ------------------------------------- } diff --git a/src/sparse/KokkosSparse_spgemm.hpp b/src/sparse/KokkosSparse_spgemm.hpp index 72b7000401..ef4abfc20b 100644 --- a/src/sparse/KokkosSparse_spgemm.hpp +++ b/src/sparse/KokkosSparse_spgemm.hpp @@ -53,7 +53,6 @@ namespace KokkosSparse { template void spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) { - using graph_type = typename CMatrix::staticcrsgraph_type; 
using row_map_type = typename CMatrix::row_map_type::non_const_type; using entries_type = typename CMatrix::index_type::non_const_type; using values_type = typename CMatrix::values_type::non_const_type; @@ -77,8 +76,7 @@ void spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, c_nnz_size); } - graph_type graphC(entriesC, row_mapC); - C = CMatrix("matrix", graphC); + C = CMatrix("C=AB", A.numRows(), B.numCols(), c_nnz_size, valuesC, row_mapC, entriesC); } template diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index ca83cb217b..aca370e476 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -157,12 +157,23 @@ spmv (KokkosKernels::Experimental::Controls controls, #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE //cuSPARSE does not support the conjugate mode (C), and cuSPARSE 9 only supports the normal (N) mode. + if(std::is_same::value || + std::is_same::value) + { #if (9000 <= CUDA_VERSION) - useFallback = useFallback || (mode[0] != NoTranspose[0]); + useFallback = useFallback || (mode[0] != NoTranspose[0]); #endif #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) - useFallback = useFallback || (mode[0] == Conjugate[0]); + useFallback = useFallback || (mode[0] == Conjugate[0]); +#endif + } #endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if(std::is_same::value) + { + useFallback = useFallback || (mode[0] == Conjugate[0]); + } #endif if(useFallback) diff --git a/src/sparse/KokkosSparse_sptrsv.hpp b/src/sparse/KokkosSparse_sptrsv.hpp index aafd365590..2ac041201e 100644 --- a/src/sparse/KokkosSparse_sptrsv.hpp +++ b/src/sparse/KokkosSparse_sptrsv.hpp @@ -108,11 +108,17 @@ namespace Experimental { Kokkos::MemoryTraits > Entries_Internal; + #ifdef KK_TRISOLVE_TIMERS + Kokkos::Timer timer_sptrsv; + #endif RowMap_Internal rowmap_i = rowmap; Entries_Internal entries_i = entries; KokkosSparse::Impl::SPTRSV_SYMBOLIC::sptrsv_symbolic (&tmp_handle, rowmap_i, entries_i); + #ifdef 
KK_TRISOLVE_TIMERS + std::cout << " > sptrsv_symbolic time = " << timer_sptrsv.seconds() << std::endl; + #endif } // sptrsv_symbolic template > Values_Internal; + #ifdef KK_TRISOLVE_TIMERS + Kokkos::Timer timer_sptrsv; + #endif auto sptrsv_handle = handle->get_sptrsv_handle(); if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { RowMap_Internal rowmap_i = rowmap; @@ -189,7 +198,9 @@ namespace Experimental { else { KokkosSparse::Experimental::sptrsv_symbolic (handle, rowmap, entries); } - + #ifdef KK_TRISOLVE_TIMERS + std::cout << " + sptrsv_symbolic time = " << timer_sptrsv.seconds() << std::endl; + #endif } // sptrsv_symbolic template ; using integer_view_host_t = Kokkos::View; - using workspace_t = typename Kokkos::View; + using workspace_t = typename Kokkos::View>; // using host_crsmat_t = KokkosSparse::CrsMatrix; - using crsmat_t = KokkosSparse::CrsMatrix; + using crsmat_t = KokkosSparse::CrsMatrix, void, size_type>; // using host_graph_t = typename host_crsmat_t::StaticCrsGraphType; diff --git a/src/sparse/KokkosSparse_sptrsv_superlu.hpp b/src/sparse/KokkosSparse_sptrsv_superlu.hpp index 5e83f83f80..e8d0746f9b 100644 --- a/src/sparse/KokkosSparse_sptrsv_superlu.hpp +++ b/src/sparse/KokkosSparse_sptrsv_superlu.hpp @@ -296,6 +296,7 @@ void sptrsv_symbolic( #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE double time_seconds = tic.seconds (); std::cout << " Conversion Time (from SuperLU to CSR): " << time_seconds << std::endl; + tic.reset(); #endif // =================================================================== @@ -313,6 +314,10 @@ void sptrsv_symbolic( sptrsv_supernodal_symbolic (nsuper, supercols, etree, graphL_host, kernelHandleL, graphU_host, kernelHandleU); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = tic.seconds (); + std::cout << " SpTRSV Supernodal Symbolic Time : " << time_seconds << std::endl; + #endif } diff --git a/src/sparse/KokkosSparse_sptrsv_supernode.hpp 
b/src/sparse/KokkosSparse_sptrsv_supernode.hpp index e73837e3a4..f71c7d6109 100644 --- a/src/sparse/KokkosSparse_sptrsv_supernode.hpp +++ b/src/sparse/KokkosSparse_sptrsv_supernode.hpp @@ -450,6 +450,10 @@ void check_supernode_sizes(const char *title, int n, int nsuper, input_size_type template host_graph_t generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const input_size_type *nb) { + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + double time_seconds = 0.0; + Kokkos::Timer timer; + #endif using size_type = typename graph_t::size_type; using cols_view_host_t = typename host_graph_t::entries_type::non_const_type; @@ -476,13 +480,19 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu // count non-empty supernodal blocks row_map_view_host_t hr ("rowmap_view", nsuper+1); integer_view_host_t check ("check", nsuper); + integer_view_host_t idxs ("idxs", nsuper); Kokkos::deep_copy (hr, 0); Kokkos::deep_copy (check, 0); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + timer.reset (); + #endif int nblocks = 0; for (int s = 0; s < nsuper; s++) { int j1 = nb[s]; int j2 = j1+1; // based on the first row + + size_type nidxs = 0; for (size_type i = row_map_host (j1); i < row_map_host (j2); i++) { int s2 = map (entries_host (i)); // supernodal blocks may not be filled with zeros @@ -493,10 +503,16 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu nblocks ++; // count blocks per row for col_major hr (s2+1) ++; + // keep track of non-zero block ids + idxs (nidxs) = s2; + nidxs ++; } } // reset check - Kokkos::deep_copy (check, 0); + //Kokkos::deep_copy (check, 0); + for (size_type i = 0; i < nidxs; i++) { + check (idxs(i)) = 0; + } } cols_view_host_t hc ("colmap_view", nblocks); @@ -506,11 +522,18 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu hr (s+1) += hr (s); } } + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = timer.seconds (); + std::cout << " > Generate 
Supernodal Graph: count blocks : " << time_seconds << std::endl; + timer.reset (); + #endif nblocks = 0; for (int s = 0; s < nsuper; s++) { int j1 = nb[s]; int j2 = j1+1; // based on the first row + + size_type nidxs = 0; for (size_type i = row_map_host (j1); i < row_map_host (j2); i++) { int s2 = map (entries_host (i)); // supernodal blocks may not be filled with zeros @@ -525,19 +548,25 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu hc (nblocks) = s2; } nblocks ++; + // keep track of non-zero block ids + idxs (nidxs) = s2; + nidxs ++; } } if (!col_major) { hr (s+1) = nblocks; } // reset check - if (!col_major) { + /*if (!col_major) { for (size_type s2 = hr(s); s2 < hr(s+1); s2++) { check (hc(s2)) = 0; } } else { // NOTE: nonzero supernodes in s-th col are not stored Kokkos::deep_copy (check, 0); + }*/ + for (size_type i = 0; i < nidxs; i++) { + check (idxs(i)) = 0; } } // fix hr @@ -547,10 +576,21 @@ generate_supernodal_graph(bool col_major, graph_t &graph, int nsuper, const inpu } hr (0) = 0; } + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = timer.seconds (); + std::cout << " > Generate Supernodal Graph: compress graph : " << time_seconds + << " (col_major = " << col_major << ")" << std::endl; + timer.reset (); + #endif + // sort column ids per row for (int s = 0; s < nsuper; s++) { std::sort(&(hc (hr (s))), &(hc (hr (s+1)))); } + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = timer.seconds (); + std::cout << " > Generate Supernodal Graph: sort graph : " << time_seconds << std::endl << std::endl; + #endif host_graph_t static_graph (hc, hr); return static_graph; @@ -1018,17 +1058,32 @@ void sptrsv_supernodal_symbolic( // save the supernodal info in the handles for L/U solves handleL->set_supernodes (nsuper, supercols_view, etree); handleU->set_supernodes (nsuper, supercols_view, etree); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = tic.seconds (); + std::cout << " Deep-copy graph Time: " << 
time_seconds << std::endl; + tic.reset (); + #endif if (handleL->get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_DAG || handleL->get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { // generate supernodal graphs for DAG scheduling auto supL = generate_supernodal_graph (!col_majorL, graphL_host, nsuper, supercols); auto supU = generate_supernodal_graph ( col_majorU, graphU_host, nsuper, supercols); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = tic.seconds (); + std::cout << " Compute Supernodal Graph Time: " << time_seconds << std::endl; + tic.reset (); + #endif auto dagL = generate_supernodal_dag (nsuper, supL, supU); auto dagU = generate_supernodal_dag (nsuper, supU, supL); handleL->set_supernodal_dag (dagL); handleU->set_supernodal_dag (dagU); + #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE + time_seconds = tic.seconds (); + std::cout << " Compute DAG Time: " << time_seconds << std::endl; + tic.reset (); + #endif } // =================================================================== diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 4df18eb833..89c6f81a2c 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -97,8 +97,10 @@ void level_sched ( IlukHandle& thandle, level_ptr(level_list(i)-1) += 1; } - for ( size_type i = nlevels-1; i > 0; --i ) { - level_ptr(i) = level_ptr(i-1); + if (nlevels>0) {//note: to avoid wrapping around to the max of size_t when nlevels = 0. 
+ for ( size_type i = nlevels-1; i > 0; --i ) { + level_ptr(i) = level_ptr(i-1); + } } level_ptr(0) = 0; diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 7b91f95e09..f06e2fb9d9 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -318,12 +318,88 @@ spmv_beta_no_transpose (const KokkosKernels::Experimental::Controls& controls, typename YVector::const_value_type& beta, const YVector& y) { - typedef typename AMatrix::ordinal_type ordinal_type; + typedef typename AMatrix::non_const_ordinal_type ordinal_type; typedef typename AMatrix::execution_space execution_space; if (A.numRows () <= static_cast (0)) { return; } +#if defined(KOKKOS_ENABLE_SERIAL) + if(std::is_same::value) { + /// serial impl + typedef typename AMatrix::non_const_value_type value_type; + typedef typename AMatrix::non_const_size_type size_type; + + const size_type *__restrict__ row_map_ptr = A.graph.row_map.data(); + const ordinal_type *__restrict__ col_idx_ptr = A.graph.entries.data(); + const value_type *__restrict__ values_ptr = A.values.data(); + + typename YVector::non_const_value_type *__restrict__ y_ptr = y.data(); + typename XVector::const_value_type *__restrict__ x_ptr = x.data(); + + const typename YVector::non_const_value_type zero(0); + const ordinal_type nrow = A.numRows(); + if (alpha == zero) { + if (dobeta == 0) { + /// not working with kkosDev2_CUDA110_GCC92_cpp17/ + ///memset(y_ptr, 0, sizeof(typename YVector::value_type)*nrow); + for (int i=0;i::value) && @@ -418,45 +494,113 @@ spmv_beta_transpose (typename YVector::const_value_type& alpha, KokkosBlas::scal (y, beta, y); } - // Assuming that no row contains duplicate entries, NNZPerRow - // cannot be more than the number of columns of the matrix. Thus, - // the appropriate type is ordinal_type. 
- const ordinal_type NNZPerRow = A.nnz () / A.numRows (); +#if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_THREADS) + { + int impl_thread_pool_size(0); +#if defined(KOKKOS_ENABLE_SERIAL) + if (std::is_same::value) + impl_thread_pool_size = 1; +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + if (std::is_same::value) + impl_thread_pool_size = Kokkos::OpenMP::impl_thread_pool_size(); +#endif +#if defined(KOKKOS_ENABLE_THREADS) + if (std::is_same::value) + impl_thread_pool_size = Kokkos::Threads::impl_thread_pool_size(); +#endif - int vector_length = 1; - bool use_teams = KokkosKernels::Impl::kk_is_gpu_exec_space(); - int max_vector_length = 1; + if (impl_thread_pool_size == 1) { + /// serial impl + typedef typename AMatrix::non_const_value_type value_type; + typedef Kokkos::Details::ArithTraits ATV; + const size_type *__restrict__ row_map_ptr = A.graph.row_map.data(); + const ordinal_type *__restrict__ col_idx_ptr = A.graph.entries.data(); + const value_type *__restrict__ values_ptr = A.values.data(); + + typename YVector::value_type *__restrict__ y_ptr = y.data(); + typename XVector::value_type *__restrict__ x_ptr = x.data(); + + const typename YVector::non_const_value_type zero(0); + const ordinal_type nrow = A.numRows(); + if (alpha == zero) { + /// do nothing + } else { + for (int i=0;i(); + int max_vector_length = 1; #ifdef KOKKOS_ENABLE_CUDA - if(std::is_same::value) - max_vector_length = 32; + if(std::is_same::value) + max_vector_length = 32; #endif #ifdef KOKKOS_ENABLE_HIP - if(std::is_same::value) - max_vector_length = 64; + if(std::is_same::value) + max_vector_length = 64; #endif - if(use_teams) { - while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) - vector_length*=2; - } - - typedef SPMV_Transpose_Functor OpType; - - typename AMatrix::const_ordinal_type nrow = A.numRows(); - - OpType op (alpha, A, x, y); - - if(use_teams) { - const ordinal_type rows_per_thread = RowsPerThread 
(NNZPerRow); - const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); - const ordinal_type rows_per_team = rows_per_thread * team_size; - op.rows_per_team = rows_per_team; - const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< execution_space > - ( nteams , team_size , vector_length ) , op ); - } - else { - Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< execution_space > - ( 0 , nrow ) , op ); + if(use_teams) { + while( (vector_length*2*3 <= NNZPerRow) && (vector_length < max_vector_length) ) + vector_length*=2; + } + + typedef SPMV_Transpose_Functor OpType; + + typename AMatrix::const_ordinal_type nrow = A.numRows(); + + OpType op (alpha, A, x, y); + + if(use_teams) { + const ordinal_type rows_per_thread = RowsPerThread (NNZPerRow); + const ordinal_type team_size = Kokkos::TeamPolicy(rows_per_thread, Kokkos::AUTO, vector_length).team_size_recommended(op, Kokkos::ParallelForTag()); + const ordinal_type rows_per_team = rows_per_thread * team_size; + op.rows_per_team = rows_per_team; + const size_type nteams = (nrow+rows_per_team-1)/rows_per_team; + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::TeamPolicy< execution_space > + ( nteams , team_size , vector_length ) , op ); + } + else { + Kokkos::parallel_for("KokkosSparse::spmv", Kokkos::RangePolicy< execution_space > + ( 0 , nrow ) , op ); + } } } diff --git a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp index 72c8a969fe..7ac4936f51 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl_omp.hpp @@ -69,7 +69,7 @@ void spmv_raw_openmp_no_transpose(typename YVector::const_value_type& s_a, AMatr typename YVector::const_value_type zero = 0; #pragma omp parallel { -#ifdef KOKKOS_COMPILER_INTEL +#if defined(KOKKOS_COMPILER_INTEL) && 
!defined(__clang__) __assume_aligned(x_ptr, 64); __assume_aligned(y_ptr, 64); #endif diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 271d8b2396..0332b82e49 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -617,7 +617,7 @@ struct SparseTriSupernodalSpMVFunctor using scalar_t = typename LHSType::non_const_value_type; - using work_view_t = typename Kokkos::View; + using work_view_t = typename Kokkos::View>; int flag; long node_count; @@ -698,7 +698,7 @@ struct LowerTriSupernodalFunctor using scalar_t = typename ValuesType::non_const_value_type; using integer_view_t = Kokkos::View; - using work_view_t = typename Kokkos::View; + using work_view_t = typename Kokkos::View>; using range_type = Kokkos::pair; @@ -875,7 +875,7 @@ struct UpperTriSupernodalFunctor using scalar_t = typename ValuesType::non_const_value_type; using integer_view_t = Kokkos::View; - using work_view_t = typename Kokkos::View; + using work_view_t = typename Kokkos::View>; using SupernodeView = typename Kokkos::View; @@ -1028,7 +1028,7 @@ struct UpperTriTranSupernodalFunctor using scalar_t = typename ValuesType::non_const_value_type; using integer_view_t = Kokkos::View; - using work_view_t = typename Kokkos::View; + using work_view_t = typename Kokkos::View>; using range_type = Kokkos::pair; @@ -2656,6 +2656,10 @@ cudaProfilerStop(); size_type node_count = 0; + #ifdef profile_supernodal_etree + Kokkos::Timer sptrsv_timer; + sptrsv_timer.reset(); + #endif for ( size_type lvl = 0; lvl < nlevels; ++lvl ) { { size_type lvl_nodes = hnodes_per_level(lvl); @@ -2716,7 +2720,6 @@ cudaProfilerStart(); thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_ETREE || thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_DAG) { - //#define profile_supernodal_etree #ifdef profile_supernodal_etree size_t flops = 0; Kokkos::Timer timer; @@ -2884,6 +2887,13 @@ 
cudaProfilerStop(); } // scope for if-block } // end for lvl + #ifdef profile_supernodal_etree + Kokkos::fence(); + double sptrsv_time_seconds = sptrsv_timer.seconds (); + std::cout << " + Execution space : " << execution_space::name () << std::endl; + std::cout << " + Memory space : " << memory_space::name () << std::endl; + std::cout << " + SpTrsv(lower) time: " << sptrsv_time_seconds << std::endl << std::endl; + #endif } // end lower_tri_solve @@ -2954,6 +2964,10 @@ cudaProfilerStop(); size_type node_count = 0; // This must stay serial; would be nice to try out Cuda's graph stuff to reduce kernel launch overhead + #ifdef profile_supernodal_etree + Kokkos::Timer sptrsv_timer; + sptrsv_timer.reset(); + #endif for ( size_type lvl = 0; lvl < nlevels; ++lvl ) { size_type lvl_nodes = hnodes_per_level(lvl); @@ -3279,6 +3293,13 @@ cudaProfilerStop(); #endif } // end if } // end for lvl + #ifdef profile_supernodal_etree + Kokkos::fence(); + double sptrsv_time_seconds = sptrsv_timer.seconds (); + std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl << std::endl; + std::cout <<" + Execution space : " << execution_space::name () << std::endl; + std::cout << " + Memory space : " << memory_space::name () << std::endl; + #endif } // end upper_tri_solve diff --git a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 19694063f0..45ebfe9e00 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -167,6 +167,7 @@ template < class TriSolveHandle, class RowMapType, class EntriesType > void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, const EntriesType dentries) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_lowertri_total; + Kokkos::Timer timer; #endif using namespace KokkosSparse::Experimental; @@ -397,6 +398,23 @@ void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType 
drow_map, con work_offset_host (s) = 0; } } else { + //#define profile_supernodal_etree + #ifdef profile_supernodal_etree + // min, max, tot size of supernodes + signed_integral_t max_nsrow = 0; + signed_integral_t min_nsrow = 0; + signed_integral_t tot_nsrow = 0; + + signed_integral_t max_nscol = 0; + signed_integral_t min_nscol = 0; + signed_integral_t tot_nscol = 0; + + // min, max, tot num of leaves + signed_integral_t max_nleave = 0; + signed_integral_t min_nleave = 0; + signed_integral_t tot_nleave = 0; + #endif + /* initialize the ready tasks with leaves */ const int *parents = thandle.get_etree_parents (); integer_view_host_t check ("check", nsuper); @@ -421,22 +439,6 @@ void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, con size_type num_done = 0; size_type level = 0; - //#define profile_supernodal_etree - #ifdef profile_supernodal_etree - // min, max, tot size of supernodes - signed_integral_t max_nsrow = 0; - signed_integral_t min_nsrow = 0; - signed_integral_t tot_nsrow = 0; - - signed_integral_t max_nscol = 0; - signed_integral_t min_nscol = 0; - signed_integral_t tot_nscol = 0; - - // min, max, tot num of leaves - signed_integral_t max_nleave = 0; - signed_integral_t min_nleave = 0; - signed_integral_t tot_nleave = 0; - #endif while (num_done < nsuper) { nodes_per_level (level) = 0; // look for ready-tasks @@ -564,9 +566,15 @@ void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, con std::cout << " * numer of leaves: min = " << min_nleave << "\t max = " << max_nleave << "\t avg = " << tot_nleave/level << std::endl; std::cout << " * level = " << level << std::endl; #endif + #ifdef TRISOLVE_SYMB_TIMERS + std::cout << " + scheduling time = " << timer.seconds() << std::endl; + #endif // Set number of level equal to be the number of supernodal columns thandle.set_num_levels (level); } + #ifdef TRISOLVE_SYMB_TIMERS + timer.reset(); + #endif // workspace size if (thandle.get_algorithm () == 
SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { @@ -590,6 +598,10 @@ void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, con Kokkos::deep_copy (dnodes_per_level, nodes_per_level); Kokkos::deep_copy (dlevel_list, level_list); + #ifdef TRISOLVE_SYMB_TIMERS + std::cout << " + workspace time = " << timer.seconds() << std::endl; + #endif + thandle.set_symbolic_complete(); } #endif @@ -604,6 +616,7 @@ template < class TriSolveHandle, class RowMapType, class EntriesType > void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, const EntriesType dentries ) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_uppertri_total; + Kokkos::Timer timer; #endif using namespace KokkosSparse::Experimental; @@ -826,6 +839,21 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co else { /* schduling from bottom to top (as for L-solve) * * then reverse it for U-solve */ + #ifdef profile_supernodal_etree + // min, max, tot size of supernodes + signed_integral_t max_nsrow = 0; + signed_integral_t min_nsrow = 0; + signed_integral_t tot_nsrow = 0; + + signed_integral_t max_nscol = 0; + signed_integral_t min_nscol = 0; + signed_integral_t tot_nscol = 0; + + // min, max, tot num of leaves + signed_integral_t max_nleave = 0; + signed_integral_t min_nleave = 0; + signed_integral_t tot_nleave = 0; + #endif /* initialize the ready tasks with leaves */ const int *parents = thandle.get_etree_parents (); @@ -860,21 +888,6 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co size_type num_done = 0; size_type level = 0; - #ifdef profile_supernodal_etree - // min, max, tot size of supernodes - signed_integral_t max_nsrow = 0; - signed_integral_t min_nsrow = 0; - signed_integral_t tot_nsrow = 0; - - signed_integral_t max_nscol = 0; - signed_integral_t min_nscol = 0; - signed_integral_t tot_nscol = 0; - - // min, max, tot num of leaves - 
signed_integral_t max_nleave = 0; - signed_integral_t min_nleave = 0; - signed_integral_t tot_nleave = 0; - #endif while (num_done < nsuper) { nodes_per_level (level) = 0; // look for ready-tasks @@ -1013,10 +1026,16 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co diag_kernel_type_by_level (level) = 3; } } + #ifdef TRISOLVE_SYMB_TIMERS + std::cout << " + scheduling time = " << timer.seconds() << std::endl; + #endif // Set number of levels thandle.set_num_levels (num_level); } + #ifdef TRISOLVE_SYMB_TIMERS + timer.reset(); + #endif // workspace size if (thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV || thandle.get_algorithm () == SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG) { @@ -1039,6 +1058,9 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co Kokkos::deep_copy (dnodes_grouped_by_level, nodes_grouped_by_level); Kokkos::deep_copy (dnodes_per_level, nodes_per_level); Kokkos::deep_copy (dlevel_list, level_list); + #ifdef TRISOLVE_SYMB_TIMERS + std::cout << " + workspace time = " << timer.seconds() << std::endl; + #endif thandle.set_symbolic_complete (); } diff --git a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index f78a7e0cd4..7b2db4ba9a 100644 --- a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -61,7 +61,7 @@ #include "KokkosSparse_gauss_seidel_handle.hpp" -#define KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV +//#define KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS namespace KokkosSparse{ namespace Impl{ @@ -103,6 +103,10 @@ namespace KokkosSparse{ input_size_t>; using input_graph_t = typename input_crsmat_t::StaticCrsGraphType; using single_vector_view_t = Kokkos::View; + using internal_vector_view_t = typename TwoStageGaussSeidelHandleType::vector_view_t; + + using ST = Kokkos::Details::ArithTraits; + using mag_t = typename ST::mag_type; 
private: HandleType *handle; @@ -132,13 +136,11 @@ namespace KokkosSparse{ struct Tag_countNnzL{}; struct Tag_countNnzU{}; // tag for inserting entries - struct Tag_entriesL{}; - struct Tag_entriesU{}; struct Tag_entriesLU{}; // tag for inserting values - struct Tag_valuesL{}; - struct Tag_valuesU{}; struct Tag_valuesLU{}; + // tag for computing residual norm + struct Tag_normR{}; template ::one (); - ordinal_t nnz = row_map (i); - for (size_type k = rowmap_view (i); k < rowmap_view (i+1); k++) { - if (column_view (k) < i) { - values (nnz) = values_view (k); - nnz ++; - } else if (column_view (k) == i) { - if (two_stage) { - if (diagos_given) { - diags (i) = d_invert_view (i); - } else { - diags (i) = one / values_view (k); - } - } else { - values (nnz) = values_view (k); - nnz ++; - } - } - } - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) - if (two_stage) { - for (size_type k = row_map (i); k < nnz; k++) { - values (k) *= diags (i); + if (i == 0) { + row_map_a (0) = 0; } } - #endif + nnz += nnz_i; } - // ------------------------------------------------------- // // functor for counting nnzU (with parallel_reduce) KOKKOS_INLINE_FUNCTION @@ -369,55 +384,18 @@ namespace KokkosSparse{ if (i == 0) { row_map (0) = 0; } - nnz += nnz_i; - } - - // functor for storing entriesU (with parallel_for) - KOKKOS_INLINE_FUNCTION - void operator()(const Tag_entriesU&, const ordinal_t i) const - { - ordinal_t nnz = row_map (i); - for (size_type k = rowmap_view (i); k < rowmap_view (i+1); k++) { - if (column_view (k) > i && column_view (k) < num_rows) { - entries (nnz) = column_view (k); - nnz ++; - } else if(!two_stage && column_view (k) == i) { - entries (nnz) = column_view (k); - nnz ++; - } - } - } - - // functor for storing valuesU (with parallel_for) - KOKKOS_INLINE_FUNCTION - void operator()(const Tag_valuesU&, const ordinal_t i) const - { - const_scalar_t one = Kokkos::Details::ArithTraits::one (); - ordinal_t nnz = row_map (i); - for (size_type k = rowmap_view (i); 
k < rowmap_view (i+1); k++) { - if (column_view (k) == i) { - if (two_stage) { - if (diagos_given) { - diags (i) = d_invert_view (i); - } else { - diags (i) = one / values_view (k); - } - } else { - values (nnz) = values_view (k); - nnz ++; - } - } else if (column_view (k) > i && column_view (k) < num_rows) { - values (nnz) = values_view (k); - nnz ++; + if (compact_form) { + // complement of U+D + row_map_a (i+1) = (rowmap_view (i+1) - rowmap_view (i)) - nnz_i; + if (two_stage) { + // two-stage iterates with U (no D) + row_map_a (i+1) --; } - } - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) - if (two_stage) { - for (size_type k = row_map (i); k < nnz; k++) { - values (k) *= diags (i); + if (i == 0) { + row_map_a (0) = 0; } } - #endif + nnz += nnz_i; } // ------------------------------------------------------- // @@ -427,6 +405,12 @@ namespace KokkosSparse{ { ordinal_t nnzL = row_map (i); ordinal_t nnzU = row_map2 (i); + ordinal_t nnzLa = 0; + ordinal_t nnzUa = 0; + if (compact_form) { + nnzLa = row_map_a (i); + nnzUa = row_map_a2 (i); + } if (!two_stage) { // NOTE: Kokkos' sptrsv assumes diagonal of U to be at the start entries2 (nnzU) = i; @@ -434,11 +418,32 @@ namespace KokkosSparse{ } for (size_type k = rowmap_view (i); k < rowmap_view (i+1); k++) { if (column_view (k) < i) { + // L entries (nnzL) = column_view (k); nnzL ++; - } else if (column_view (k) > i && column_view (k) < num_rows) { - entries2 (nnzU) = column_view (k); - nnzU ++; + if (compact_form) { + // complement of U+D + entries_a (nnzLa) = column_view (k); + nnzLa ++; + } + } else if (column_view (k) > i) { + if (column_view (k) < num_rows) { + // U + entries2 (nnzU) = column_view (k); + nnzU ++; + if (compact_form) { + // complement of L+D + entries_a2 (nnzUa) = column_view (k); + nnzUa ++; + } + } else if (compact_form) { + // complement of U+D + entries_a (nnzLa) = column_view (k); + nnzLa ++; + // complement of L+D + entries_a2 (nnzUa) = column_view (k); + nnzUa ++; + } } } if 
(!two_stage) { @@ -455,6 +460,12 @@ namespace KokkosSparse{ const_scalar_t one = Kokkos::Details::ArithTraits::one (); ordinal_t nnzL = row_map (i); ordinal_t nnzU = row_map2 (i); + ordinal_t nnzLa = 0; + ordinal_t nnzUa = 0; + if (compact_form) { + nnzLa = row_map_a (i); + nnzUa = row_map_a2 (i); + } if (!two_stage) { // Kokkos' sptrsv assumes diagonal U to come at the start, so increment nnzU nnzU ++; @@ -464,6 +475,11 @@ namespace KokkosSparse{ // save L (without diag) values (nnzL) = values_view (k); nnzL ++; + if (compact_form) { + // complement of U+D + values_a (nnzLa) = values_view (k); + nnzLa ++; + } } else if (column_view (k) == i) { // save D if (diagos_given) { @@ -473,10 +489,27 @@ namespace KokkosSparse{ // as original diags (i) = values_view (k); } - } else if (column_view (k) < num_rows) { - // save U (without diag) - values2 (nnzU) = values_view (k); - nnzU ++; + if (compact_form) { + diags_a (i) = values_view (k); + } + } else { + if (column_view (k) < num_rows) { + // save U (without diag) + values2 (nnzU) = values_view (k); + nnzU ++; + if (compact_form) { + // complement of L+D + values_a2 (nnzUa) = values_view (k); + nnzUa ++; + } + } else if (compact_form) { + // complement of U+D + values_a (nnzLa) = values_view (k); + nnzLa ++; + // complement of L+D + values_a2 (nnzUa) = values_view (k); + nnzUa ++; + } } } if (!two_stage) { @@ -485,27 +518,50 @@ namespace KokkosSparse{ nnzU = row_map2 (i); if (diagos_given) { values2 (nnzU) = one / diags (i); - values (nnzL) = one / diags (i); + values (nnzL) = one / diags (i); } else { values2 (nnzU) = diags (i); - values (nnzL) = diags (i); + values (nnzL) = diags (i); } } - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) if (two_stage) { if (!diagos_given) { // when diag is provided, it is already provided as inverse diags (i) = one / diags (i); } - // compute inv(D)*L + // compute inv(D)*L (apply row-scaling to valueL) for (size_type k = row_map (i); k < row_map (i+1); k++) { values (k) *= 
diags (i); } + // compute inv(D)*U (apply row-scaling to valueU) for (size_type k = row_map2 (i); k < row_map2 (i+1); k++) { values2 (k) *= diags (i); } } - #endif + } + + // ------------------------------------------------------- // + // functor for computing residual norm (with parallel_reduce) + KOKKOS_INLINE_FUNCTION + void operator()(const Tag_normR&, const ordinal_t i, mag_t &normR) const + { + scalar_t normRi = localB (i, 0); + if (forward_sweep) { + // compute R(i) = B(i) - (L+D)(i,:)*X + for (size_type k = rowmap_view (i); k < rowmap_view (i+1); k++) { + if (column_view (k) <= i) { + normRi -= values_view (k) * localX (column_view (k), 0); + } + } + } else { + // compute R(i) = B(i) - (D+U)(i,:)*X + for (size_type k = rowmap_view (i); k < rowmap_view (i+1); k++) { + if (column_view (k) >= i && column_view (k) < num_rows) { + normRi -= values_view (k) * localX (column_view (k), 0); + } + } + } + normR += ST::abs (normRi * normRi); } }; // --------------------------------------------------------- // @@ -574,27 +630,44 @@ namespace KokkosSparse{ #endif auto *gsHandle = get_gs_handle(); bool two_stage = gsHandle->isTwoStage (); + bool compact_form = gsHandle->isCompactForm (); GSDirection direction = gsHandle->getSweepDirection (); using GS_Functor_t = TwostageGaussSeidel_functor; // count nnz in local L & U matrices (rowmap_viewL/rowmap_viewU stores offsets for each row) - ordinal_t nnzL = 0; - ordinal_t nnzU = 0; - row_map_view_t rowmap_viewL ("row_mapL", num_rows+1); - row_map_view_t rowmap_viewU ("row_mapU", num_rows+1); + ordinal_t nnzA = column_view.extent (0); + ordinal_t nnzL = 0; // lower-part of diagonal block + ordinal_t nnzU = 0; // upper-part of diagonal block + row_map_view_t rowmap_viewL ("row_mapL", num_rows+1); // lower-part of diagonal block + row_map_view_t rowmap_viewU ("row_mapU", num_rows+1); // upper-part of diagonal block + row_map_view_t rowmap_viewLa ("row_mapLa", num_rows+1); // complement of U+D + row_map_view_t rowmap_viewUa 
("row_mapUa", num_rows+1); // complement of L+D if (direction == GS_FORWARD || direction == GS_SYMMETRIC) { using range_policy = Kokkos::RangePolicy ; Kokkos::parallel_reduce ("nnzL", range_policy (0, num_rows), - GS_Functor_t (two_stage, num_rows, rowmap_view, column_view, - rowmap_viewL), + GS_Functor_t (two_stage, compact_form, + num_rows, rowmap_view, column_view, + rowmap_viewL, rowmap_viewUa), nnzL); } if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { using range_policy = Kokkos::RangePolicy ; Kokkos::parallel_reduce ("nnzU", range_policy (0, num_rows), - GS_Functor_t (two_stage, num_rows, rowmap_view, column_view, - rowmap_viewU), + GS_Functor_t (two_stage, compact_form, + num_rows, rowmap_view, column_view, + rowmap_viewU, rowmap_viewLa), nnzU); } + ordinal_t nnzLa = 0; // complement of U+D + ordinal_t nnzUa = 0; // complement of L+D + if (compact_form) { + nnzLa = nnzA - nnzU; + nnzUa = nnzA - nnzL; + if (two_stage) { + // two-stage iterates with L or U (no D) + nnzLa -= num_rows; + nnzUa -= num_rows; + } + } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = timer.seconds (); @@ -605,10 +678,18 @@ namespace KokkosSparse{ if (direction == GS_FORWARD || direction == GS_SYMMETRIC) { KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum (1+num_rows, rowmap_viewL); + if (compact_form) { + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum + (1+num_rows, rowmap_viewLa); + } } if (direction == GS_BACKWARD || direction == GS_SYMMETRIC) { KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum (1+num_rows, rowmap_viewU); + if (compact_form) { + KokkosKernels::Impl::kk_inclusive_parallel_prefix_sum + (1+num_rows, rowmap_viewUa); + } } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); @@ -626,6 +707,14 @@ namespace KokkosSparse{ // allocate memory to store local U entries_view_t column_viewU (Kokkos::ViewAllocateWithoutInitializing("entriesU"), nnzU); values_view_t values_viewU (Kokkos::ViewAllocateWithoutInitializing("valuesU"), 
nnzU); + + // allocate memory to store complement of U+D + entries_view_t column_viewLa (Kokkos::ViewAllocateWithoutInitializing("entriesLa"), nnzLa); + values_view_t values_viewLa (Kokkos::ViewAllocateWithoutInitializing("valuesLa"), nnzLa); + + // allocate memory to store complement of L+D + entries_view_t column_viewUa (Kokkos::ViewAllocateWithoutInitializing("entriesUa"), nnzUa); + values_view_t values_viewUa (Kokkos::ViewAllocateWithoutInitializing("valuesUa"), nnzUa); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = timer.seconds (); @@ -636,10 +725,14 @@ namespace KokkosSparse{ { // extract local L & U structures (for computing (L+D)^{-1} or (D+U)^{-1}) using range_policy = Kokkos::RangePolicy ; - Kokkos::parallel_for ("entryLU", range_policy (0, num_rows), - GS_Functor_t (two_stage, num_rows, rowmap_view, column_view, - rowmap_viewL, column_viewL, - rowmap_viewU, column_viewU)); + Kokkos::parallel_for ("entriesLU", range_policy (0, num_rows), + GS_Functor_t (two_stage, compact_form, + num_rows, rowmap_view, column_view, + rowmap_viewL, column_viewL, + rowmap_viewU, column_viewU, + // + rowmap_viewLa, column_viewLa, + rowmap_viewUa, column_viewUa)); } #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); @@ -658,6 +751,22 @@ namespace KokkosSparse{ gsHandle->setL (crsmatL); gsHandle->setU (crsmatU); gsHandle->setD (viewD); + + if (compact_form) { + // construct complements + graph_t graphLa (column_viewLa, rowmap_viewLa); + graph_t graphUa (column_viewUa, rowmap_viewUa); + crsmat_t crsmatLa ("La", num_rows, values_viewLa, graphLa); + crsmat_t crsmatUa ("Ua", num_rows, values_viewUa, graphUa); + + // store them in handle + gsHandle->setLa (crsmatLa); + gsHandle->setUa (crsmatUa); + + values_view_t viewDa (Kokkos::ViewAllocateWithoutInitializing("diags"), num_rows); + gsHandle->setDa (viewDa); + } + if (!(gsHandle->isTwoStage ())) { // create SpTRSV handles for classical GS using namespace KokkosSparse::Experimental; @@ -685,9 +794,11 @@ 
namespace KokkosSparse{ auto *gsHandle = get_gs_handle(); bool two_stage = gsHandle->isTwoStage (); + bool compact_form = gsHandle->isCompactForm (); // load local D from handle auto viewD = gsHandle->getD (); + auto viewDa = gsHandle->getDa (); // load local L from handle auto crsmatL = gsHandle->getL (); @@ -701,13 +812,26 @@ namespace KokkosSparse{ auto rowmap_viewU = crsmatU.graph.row_map; auto column_viewU = crsmatU.graph.entries; + // load complement of U+D from handle + auto crsmatLa = gsHandle->getLa (); + auto values_viewLa = crsmatLa.values; + auto rowmap_viewLa = crsmatLa.graph.row_map; + + // load complement of L+D from handle + auto crsmatUa = gsHandle->getUa (); + auto values_viewUa = crsmatUa.values; + auto rowmap_viewUa = crsmatUa.graph.row_map; + + // extract local L, D & U matrices using range_policy = Kokkos::RangePolicy ; Kokkos::parallel_for ("valueLU", range_policy (0, num_rows), - GS_Functor_t (two_stage, diagos_given, num_rows, + GS_Functor_t (two_stage, compact_form, diagos_given, num_rows, rowmap_view, column_view, values_view, d_invert_view, - rowmap_viewL, column_viewL, values_viewL, viewD, - rowmap_viewU, column_viewU, values_viewU)); + rowmap_viewL, values_viewL, viewD, + rowmap_viewU, values_viewU, + rowmap_viewLa, values_viewLa, viewDa, + rowmap_viewUa, values_viewUa)); #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = timer.seconds (); @@ -742,7 +866,7 @@ namespace KokkosSparse{ y_value_array_type localB, // in bool init_zero_x_vector = false, int numIter = 1, - scalar_t omega = Kokkos::Details::ArithTraits::one(), + scalar_t omega = ST::one(), bool apply_forward = true, bool apply_backward = true, bool update_y_vector = true) @@ -759,6 +883,9 @@ namespace KokkosSparse{ // auto *gsHandle = get_gs_handle(); bool two_stage = gsHandle->isTwoStage (); + bool compact_form = gsHandle->isCompactForm (); + scalar_t gamma = gsHandle->getInnerDampFactor (); + GSDirection direction = gsHandle->getSweepDirection (); if 
(apply_forward && apply_backward) { direction = GS_SYMMETRIC; @@ -772,8 +899,11 @@ namespace KokkosSparse{ // load auxiliary matrices from handle auto localD = gsHandle->getD (); - auto crsmatL = gsHandle->getL (); - auto crsmatU = gsHandle->getU (); + auto crsmatL = gsHandle->getL (); // lower-part of diagonal block + auto crsmatU = gsHandle->getU (); // upper-part of diagonal block + auto localDa = gsHandle->getDa (); + auto crsmatLa = gsHandle->getLa (); // complement of U+D (used only for compact form) + auto crsmatUa = gsHandle->getUa (); // complement of L+D (used only for compact form) // wratp A into crsmat input_graph_t graphA (column_view, rowmap_view); @@ -781,21 +911,25 @@ namespace KokkosSparse{ #ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS Kokkos::fence(); tic = timer.seconds (); + std::cout << std::endl << "TWO-STAGE GS::APPLY with " << numIter << " outer GS sweeps with omega = " << omega + << ", and " << gsHandle->getNumInnerSweeps () << " inner JR sweeps, with gamma = " << gamma + << " (numRows=" << num_rows << ")" + << std::endl; std::cout << std::endl << "TWO-STAGE GS::APPLY::CREATE CRS_A TIME : " << tic << std::endl; timer.reset(); #endif // load auxiliary vectors - int nrows = num_rows; int nrhs = localX.extent (1); - gsHandle->initVectors (nrows, nrhs); + gsHandle->initVectors (num_rows, nrhs); auto localR = gsHandle->getVectorR (); auto localT = gsHandle->getVectorT (); auto localZ = gsHandle->getVectorZ (); // outer Gauss-Seidel iteration - int NumSweeps = numIter; + int NumOuterSweeps = gsHandle->getNumOuterSweeps (); int NumInnerSweeps = gsHandle->getNumInnerSweeps (); + int NumSweeps = (NumOuterSweeps > numIter ? 
NumOuterSweeps : numIter); if (direction == GS_SYMMETRIC) { NumSweeps *= 2; } @@ -803,103 +937,225 @@ namespace KokkosSparse{ KokkosKernels::Impl::zero_vector(nrhs, localX); } for (int sweep = 0; sweep < NumSweeps; ++sweep) { - // R = B - A*x + bool forward_sweep = (direction == GS_FORWARD || + (direction == GS_SYMMETRIC && sweep%2 == 0)); + // compute residual vector KokkosBlas::scal (localR, one, localB); if (sweep > 0 || !init_zero_x_vector) { - KokkosSparse:: - spmv ("N", scalar_t(-one), crsmatA, - localX, - one, localR); + if (compact_form) { + if (forward_sweep) { + // R = B - U*x + KokkosSparse:: + spmv ("N", scalar_t(-one), crsmatUa, + localX, + one, localR); + } else { + // R = B - L*x + KokkosSparse:: + spmv ("N", scalar_t(-one), crsmatLa, + localX, + one, localR); + } + if (omega != one) { + // R = B - (U + (1-1/omega)D)*x + scalar_t omega2 = (one/omega - one); + auto localY = Kokkos::subview (localX, range_type(0, num_rows), Kokkos::ALL ()); + KokkosBlas::mult (zero, localZ, + one, localDa, localY); + KokkosBlas::axpy (omega2, localZ, localR); + } + } else { // not compact_form + // R = B - A*x + KokkosSparse:: + spmv ("N", scalar_t(-one), crsmatA, + localX, + one, localR); +#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS + { + auto localRj = Kokkos::subview (localR, Kokkos::ALL (), range_type (0, 1)); + single_vector_view_t Rj (localRj.data (), num_rows); + std::cout << "norm(GS)-" << sweep << " " << KokkosBlas::nrm2 (Rj) + << " (" << (forward_sweep ? 
"forward" : "backward" ) << ")" + << std::endl; + } +#endif + } } - if (!two_stage) { // ===== sparse-triangular solve ===== - if (direction == GS_FORWARD || - (direction == GS_SYMMETRIC && sweep%2 == 0)) { - // Z = (L+D)^{-1} * R + if (!two_stage) { + // ===== sparse-triangular solve ===== + // TODO: omega is not supported here + // (L + D is extracted in initialize_numeric, + // but (omega*L + D)^{-1} needs to be applied with omega passed into apply) + // hence, omega = one + if (omega != one) { + throw std::invalid_argument (" *** TwostageGaussSeidel::apply with omega != one is not supported with sptrsv ***\n"); + } + if (forward_sweep) { + // Z = (omega * L + D)^{-1} * R // NOTE: need to go over RHSs using namespace KokkosSparse::Experimental; for (int j = 0; j < nrhs; j++) { auto localRj = Kokkos::subview (localR, Kokkos::ALL (), range_type (j, j+1)); auto localZj = Kokkos::subview (localZ, Kokkos::ALL (), range_type (j, j+1)); - single_vector_view_t Rj (localRj.data (), nrows); - single_vector_view_t Zj (localZj.data (), nrows); + single_vector_view_t Rj (localRj.data (), num_rows); + single_vector_view_t Zj (localZj.data (), num_rows); sptrsv_solve (handle->get_gs_sptrsvL_handle(), crsmatL.graph.row_map, crsmatL.graph.entries, crsmatL.values, Rj, Zj); } } else { using namespace KokkosSparse::Experimental; - // Z = (U+D)^{-1} * R + // Z = (omega * U + D)^{-1} * R // NOTE: need to go over RHSs for (int j = 0; j < nrhs; j++) { auto localRj = Kokkos::subview (localR, Kokkos::ALL (), range_type (j, j+1)); auto localZj = Kokkos::subview (localZ, Kokkos::ALL (), range_type (j, j+1)); - single_vector_view_t Rj (localRj.data (), nrows); - single_vector_view_t Zj (localZj.data (), nrows); + single_vector_view_t Rj (localRj.data (), num_rows); + single_vector_view_t Zj (localZj.data (), num_rows); sptrsv_solve (handle->get_gs_sptrsvU_handle(), crsmatU.graph.row_map, crsmatU.graph.entries, crsmatU.values, Rj, Zj); } } - } else { // ====== inner Jacobi-Richardson ===== + 
+ // update solution (no omega) + auto localY = Kokkos::subview (localX, range_type(0, num_rows), Kokkos::ALL ()); + if (compact_form) { + // Y = omega * Z + KokkosBlas::scal (localY, one, localZ); + } else { + // Y = Y + omega * Z + KokkosBlas::axpy (one, localZ, localY); + } + } else { + // ====== inner Jacobi-Richardson ===== +#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS + //compute initial residual norm + // > compute RHS for the inner loop, R = B - A*x + internal_vector_view_t tempR ("tempR", num_rows, 1); + KokkosBlas::scal (tempR, one, localB); + KokkosSparse:: + spmv ("N", scalar_t(-one), crsmatA, + localX, + one, tempR); + // > initial vector for the inner loop is zero + Kokkos::deep_copy (localZ, zero); + using Norm_Functor_t = TwostageGaussSeidel_functor; + using range_policy = Kokkos::RangePolicy ; + { + mag_t normR = zero; + Kokkos::parallel_reduce ("normR", range_policy (0, num_rows), + Norm_Functor_t (forward_sweep, num_rows, + rowmap_view, column_view, values_view, + localD, localZ, tempR), + normR); + std::cout << "> norm(JR)-" << 0 << " " << sqrt(normR) << std::endl; + } +#endif // compute starting vector: Z = D^{-1}*R (Z is correction, i.e., output of JR) - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) if (NumInnerSweeps == 0) { // this is Jacobi-Richardson X_{k+1} := X_{k} + D^{-1}(b-A*X_{k}) // copy to localZ (output of JR iteration) + + // row-scale: (D^{-1}*L)*Y = D^{-1}*B + // compute Z := D^{-1}*R KokkosBlas::mult (zero, localZ, one, localD, localR); + // apply inner damping factor, if not one + if (gamma != one) { + // Z = gamma * Z + KokkosBlas::scal (localZ, gamma, localZ); + } } else { // copy to localT (workspace used to save D^{-1}*R for JR iteration) KokkosBlas::mult (zero, localT, one, localD, localR); // initialize Jacobi-Richardson (using R as workspace for JR iteration) KokkosBlas::scal (localR, one, localT); + + // apply inner damping factor, if not one + if (gamma != one) { + // R = gamma * R + KokkosBlas::scal (localR, 
gamma, localR); + } + } +#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS + { + // compute residual norm of the starting vector (D^{-1}R) + mag_t normR = zero; + Kokkos::parallel_reduce ("normR", range_policy (0, num_rows), + Norm_Functor_t (forward_sweep, num_rows, + rowmap_view, column_view, values_view, + localD, localT, tempR), + normR); + std::cout << "> norm(JR)-" << 1 << " " << sqrt(normR) << std::endl; } - #else - KokkosBlas::mult (zero, localT, - one, localD, localR); - #endif +#endif // inner Jacobi-Richardson: for (int ii = 0; ii < NumInnerSweeps; ii++) { - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) // T = D^{-1}*R, and L = D^{-1}*L and U = D^{-1}*U // copy T into Z KokkosBlas::scal (localZ, one, localT); - #else - // Z = R - KokkosBlas::scal (localZ, one, localR); - #endif - if (direction == GS_FORWARD || - (direction == GS_SYMMETRIC && sweep%2 == 0)) { + if (forward_sweep) { // Z = Z - L*R KokkosSparse:: - spmv("N", scalar_t(-one), crsmatL, - localR, - one, localZ); + spmv("N", scalar_t(-omega), crsmatL, + localR, + one, localZ); } else { // Z = R - U*T KokkosSparse:: - spmv("N", scalar_t(-one), crsmatU, - localR, - one, localZ); + spmv("N", scalar_t(-omega), crsmatU, + localR, + one, localZ); + } + // apply inner damping factor, if not one + if (gamma != one) { + // Z = gamma * Z + KokkosBlas::scal (localZ, gamma, localZ); + // Z = Z + (one - one/gamma) * R + scalar_t gamma2 = one - gamma; + KokkosBlas::axpy (gamma2, localR, localZ); } - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) if (ii+1 < NumInnerSweeps) { // reinitialize (R to be Z) KokkosBlas::scal (localR, one, localZ); } - #else - // T = D^{-1}*Z - KokkosBlas::mult (zero, localT, - one, localD, localZ); - #endif +#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS + { + // compute residual norm(r - (L+D)*y) + mag_t normR = zero; + Kokkos::parallel_reduce ("normR", range_policy (0, num_rows), + Norm_Functor_t (forward_sweep, num_rows, + rowmap_view, column_view, values_view, + localD, 
localZ, tempR), + normR); + std::cout << "> norm(JR)-" << 2+ii << " " << sqrt(normR) << std::endl; + } +#endif } // end of inner Jacobi Richardson - } - // Y = X + T - auto localY = Kokkos::subview (localX, range_type(0, nrows), Kokkos::ALL ()); - #if defined(KOKKOSSPARSE_IMPL_TWOSTAGE_GS_MERGE_SPMV) - KokkosBlas::axpy (one, localZ, localY); - #else - KokkosBlas::axpy (one, localT, localY); - #endif + + // update solution + auto localY = Kokkos::subview (localX, range_type(0, num_rows), Kokkos::ALL ()); + if (compact_form) { + // Y := omega * z + KokkosBlas::scal (localY, omega, localZ); + } else { + // Y := X + omega * Z + KokkosBlas::axpy (omega, localZ, localY); + } + } // end of inner GS sweep } // end of outer GS sweep +#ifdef KOKKOSSPARSE_IMPL_TIME_TWOSTAGE_GS + { + // R = B - A*x + KokkosBlas::scal (localR, one, localB); + KokkosSparse:: + spmv ("N", scalar_t(-one), crsmatA, + localX, + one, localR); + auto localRj = Kokkos::subview (localR, Kokkos::ALL (), range_type (0, 1)); + single_vector_view_t Rj (localRj.data (), num_rows); + std::cout << "norm(GS)-" << NumSweeps << " " << KokkosBlas::nrm2 (Rj) << std::endl; + } +#endif } }; } diff --git a/test_common/KokkosKernels_MatrixConverter.cpp b/test_common/KokkosKernels_MatrixConverter.cpp index 3f3fe11bae..41fb5ebc2c 100644 --- a/test_common/KokkosKernels_MatrixConverter.cpp +++ b/test_common/KokkosKernels_MatrixConverter.cpp @@ -53,8 +53,6 @@ int main (int argc, char* argv[]){ typedef int size_type; typedef int idx; typedef double wt; - - Kokkos::initialize(argc,argv); bool symmetrize = false, remove_diagonal = false, transpose = false; char *in_mtx = NULL, *out_bin = NULL; @@ -92,204 +90,208 @@ int main (int argc, char* argv[]){ exit(1); } - typedef Kokkos::DefaultHostExecutionSpace MyExecSpace; - typedef typename KokkosSparse::CrsMatrix crstmat_t; - typedef typename crstmat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename 
graph_t::entries_type::non_const_type cols_view_t; - typedef typename crstmat_t::values_type::non_const_type values_view_t; + Kokkos::initialize(argc,argv); + { + + typedef Kokkos::DefaultHostExecutionSpace MyExecSpace; + + typedef typename KokkosSparse::CrsMatrix crstmat_t; + typedef typename crstmat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crstmat_t::values_type::non_const_type values_view_t; - typedef typename graph_t::row_map_type::const_type c_row_map_view_t; - typedef typename graph_t::entries_type::const_type c_cols_view_t; - typedef typename crstmat_t::values_type::const_type c_values_view_t; + typedef typename graph_t::row_map_type::const_type c_row_map_view_t; + typedef typename graph_t::entries_type::const_type c_cols_view_t; + typedef typename crstmat_t::values_type::const_type c_values_view_t; - crstmat_t a_crsmat = KokkosKernels::Impl::read_kokkos_crst_matrix(in_mtx); + crstmat_t a_crsmat = KokkosKernels::Impl::read_kokkos_crst_matrix(in_mtx); - c_row_map_view_t orm = a_crsmat.graph.row_map; - c_cols_view_t oentries = a_crsmat.graph.entries; - c_values_view_t ovalues = a_crsmat.values; + c_row_map_view_t orm = a_crsmat.graph.row_map; + c_cols_view_t oentries = a_crsmat.graph.entries; + c_values_view_t ovalues = a_crsmat.values; - const size_type *prm = orm.data(); - const idx *pentries = oentries.data(); - const wt *pvals = ovalues.data(); + const size_type *prm = orm.data(); + const idx *pentries = oentries.data(); + const wt *pvals = ovalues.data(); - idx numrows = a_crsmat.numRows(); - //idx numcols = a_crsmat.numCols(); - idx nnz = ovalues.extent(0); - std::cout << "numrows :" << numrows << " nnz:" << nnz << std::endl; - //Kokkos::deep_copy(new_rowmap, a_crsmat.graph.row_map); + idx numrows = a_crsmat.numRows(); + //idx numcols = a_crsmat.numCols(); + idx nnz = ovalues.extent(0); + std::cout << 
"numrows :" << numrows << " nnz:" << nnz << std::endl; + //Kokkos::deep_copy(new_rowmap, a_crsmat.graph.row_map); - if (remove_diagonal) { - std::vector nrm(numrows + 1, 0); - std::vector nentries(nnz + 1); - std::vector nvals(nnz + 1); + if (remove_diagonal) { + std::vector nrm(numrows + 1, 0); + std::vector nentries(nnz + 1); + std::vector nvals(nnz + 1); - for (idx i = 0; i < numrows; ++i){ + for (idx i = 0; i < numrows; ++i){ - size_type begin = prm[i]; - size_type end = prm[i+1]; - for (size_type j = begin; j < end; ++ j){ - idx col = pentries[j]; - //wt val = pvals[j]; + size_type begin = prm[i]; + size_type end = prm[i+1]; + for (size_type j = begin; j < end; ++ j){ + idx col = pentries[j]; + //wt val = pvals[j]; - if (i == col){ - nrm[i] = 1; - break; - } + if (i == col){ + nrm[i] = 1; + break; + } + } } - } - size_type prefix = 0; - for (idx i = 0; i <= numrows; ++i){ - size_type current = nrm[i]; - nrm[i] = prefix; - prefix += current; + size_type prefix = 0; + for (idx i = 0; i <= numrows; ++i){ + size_type current = nrm[i]; + nrm[i] = prefix; + prefix += current; - } + } - for (idx i = 0; i <= numrows; ++i){ - nrm[i] = prm[i] - nrm[i]; - } + for (idx i = 0; i <= numrows; ++i){ + nrm[i] = prm[i] - nrm[i]; + } - for (idx i = 0; i < numrows; ++i){ + for (idx i = 0; i < numrows; ++i){ - size_type begin = prm[i]; - size_type end = prm[i+1]; + size_type begin = prm[i]; + size_type end = prm[i+1]; - size_type obegin = nrm[i]; + size_type obegin = nrm[i]; - for (size_type j = begin; j < end; ++ j){ - idx col = pentries[j]; - wt val = pvals[j]; - if (i != col){ - nentries[obegin] = col; - nvals[obegin++] = val; - } - } - if (obegin != nrm[i+1]){ - std::cout << "i:" << i << " nrm[i+1]:" << nrm[i+1] << " obegin:" << obegin << std::endl; - exit(1); + for (size_type j = begin; j < end; ++ j){ + idx col = pentries[j]; + wt val = pvals[j]; + if (i != col){ + nentries[obegin] = col; + nvals[obegin++] = val; + } + } + if (obegin != nrm[i+1]){ + std::cout << "i:" << i << 
" nrm[i+1]:" << nrm[i+1] << " obegin:" << obegin << std::endl; + exit(1); + } } - } - row_map_view_t new_rowmap ("new rowmap", numrows + 1); + row_map_view_t new_rowmap ("new rowmap", numrows + 1); - cols_view_t new_entries("new colmap", nrm[numrows]); - values_view_t new_values("new values", nrm[numrows ]); + cols_view_t new_entries("new colmap", nrm[numrows]); + values_view_t new_values("new values", nrm[numrows ]); - for (idx i = 0; i <= numrows; ++i){ - new_rowmap(i) = nrm[i]; - } - - for (size_type i = 0; i < nrm[numrows ]; ++i){ - new_entries(i) = nentries[i]; - new_values(i) = nvals[i]; - } + for (idx i = 0; i <= numrows; ++i){ + new_rowmap(i) = nrm[i]; + } - graph_t transpose_graph(new_entries, new_rowmap); - crstmat_t transpose_matrix("transpose", numrows, new_values, transpose_graph); - a_crsmat = transpose_matrix; + for (size_type i = 0; i < nrm[numrows ]; ++i){ + new_entries(i) = nentries[i]; + new_values(i) = nvals[i]; + } + graph_t transpose_graph(new_entries, new_rowmap); + crstmat_t transpose_matrix("transpose", numrows, new_values, transpose_graph); + a_crsmat = transpose_matrix; - orm = a_crsmat.graph.row_map; - oentries = a_crsmat.graph.entries; - ovalues = a_crsmat.values; - prm = orm.data(); - pentries = oentries.data(); - pvals = ovalues.data(); + orm = a_crsmat.graph.row_map; + oentries = a_crsmat.graph.entries; + ovalues = a_crsmat.values; - numrows = a_crsmat.numRows(); - //numcols = a_crsmat.numCols(); - nnz = ovalues.extent(0); - } + prm = orm.data(); + pentries = oentries.data(); + pvals = ovalues.data(); - if (symmetrize) { + numrows = a_crsmat.numRows(); + //numcols = a_crsmat.numCols(); + nnz = ovalues.extent(0); + } - row_map_view_t new_rowmap; - cols_view_t new_entries; + if (symmetrize) { - KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap - - (numrows, orm, oentries, new_rowmap, new_entries); - values_view_t new_values("new_values",new_entries.extent(0)); + row_map_view_t new_rowmap; + cols_view_t new_entries; - cols_view_t 
out_adj ("out_adj", new_entries.extent(0)); - values_view_t out_vals("out_vals", new_entries.extent(0)); + KokkosKernels::Impl::symmetrize_graph_symbolic_hashmap + + (numrows, orm, oentries, new_rowmap, new_entries); + values_view_t new_values("new_values",new_entries.extent(0)); - KokkosKernels::Impl::kk_sort_graph - (new_rowmap, new_entries, new_values, out_adj, out_vals); - new_entries = out_adj; - new_values = out_vals; + cols_view_t out_adj ("out_adj", new_entries.extent(0)); + values_view_t out_vals("out_vals", new_entries.extent(0)); - graph_t symmetric_graph(new_entries, new_rowmap); - crstmat_t symmetric_marix("transpose", numrows, new_values, symmetric_graph); - a_crsmat = symmetric_marix; + KokkosKernels::Impl::kk_sort_graph + (new_rowmap, new_entries, new_values, out_adj, out_vals); + new_entries = out_adj; + new_values = out_vals; - orm = a_crsmat.graph.row_map; - oentries = a_crsmat.graph.entries; - ovalues = a_crsmat.values; + graph_t symmetric_graph(new_entries, new_rowmap); + crstmat_t symmetric_marix("transpose", numrows, new_values, symmetric_graph); + a_crsmat = symmetric_marix; - prm = orm.data(); - pentries = oentries.data(); - pvals = ovalues.data(); + orm = a_crsmat.graph.row_map; + oentries = a_crsmat.graph.entries; + ovalues = a_crsmat.values; - numrows = a_crsmat.numRows(); - //numcols = a_crsmat.numCols(); - nnz = ovalues.extent(0); - } - if (transpose) { - row_map_view_t new_rowmap ("new_rowmap", a_crsmat.numCols() + 1); - cols_view_t new_entries ("new_rowmap", a_crsmat.nnz()); - values_view_t new_values ("new_rowmap", a_crsmat.nnz()); - - KokkosKernels::Impl::transpose_matrix< - c_row_map_view_t, c_cols_view_t, c_values_view_t, - row_map_view_t, cols_view_t, values_view_t, row_map_view_t, MyExecSpace>( - a_crsmat.numRows(), a_crsmat.numCols(), - a_crsmat.graph.row_map, a_crsmat.graph.entries, a_crsmat.values, - new_rowmap, new_entries, new_values); - - std::cout << 1 << std::endl; - cols_view_t out_adj ("out_adj", 
new_entries.extent(0)); - values_view_t out_vals("out_vals", new_entries.extent(0)); - std::cout << 2 << std::endl; - KokkosKernels::Impl::kk_sort_graph - (new_rowmap, new_entries, new_values, out_adj, out_vals); - new_entries = out_adj; - new_values = out_vals; - std::cout << 3 << std::endl; - MyExecSpace().fence(); - KokkosKernels::Impl::kk_print_1Dview(out_adj); - KokkosKernels::Impl::kk_print_1Dview(out_vals); - - graph_t transpose_graph(new_entries, new_rowmap); - crstmat_t transpose_matrix("transpose", a_crsmat.numRows(), new_values, transpose_graph); - a_crsmat = transpose_matrix; - - orm = a_crsmat.graph.row_map; - oentries = a_crsmat.graph.entries; - ovalues = a_crsmat.values; - - prm = orm.data(); - pentries = oentries.data(); - pvals = ovalues.data(); - - numrows = a_crsmat.numRows(); - //numcols = a_crsmat.numCols(); - nnz = ovalues.extent(0); - } + prm = orm.data(); + pentries = oentries.data(); + pvals = ovalues.data(); + numrows = a_crsmat.numRows(); + //numcols = a_crsmat.numCols(); + nnz = ovalues.extent(0); + } + if (transpose) { + row_map_view_t new_rowmap ("new_rowmap", a_crsmat.numCols() + 1); + cols_view_t new_entries ("new_rowmap", a_crsmat.nnz()); + values_view_t new_values ("new_rowmap", a_crsmat.nnz()); + + KokkosKernels::Impl::transpose_matrix< + c_row_map_view_t, c_cols_view_t, c_values_view_t, + row_map_view_t, cols_view_t, values_view_t, row_map_view_t, MyExecSpace>( + a_crsmat.numRows(), a_crsmat.numCols(), + a_crsmat.graph.row_map, a_crsmat.graph.entries, a_crsmat.values, + new_rowmap, new_entries, new_values); + + std::cout << 1 << std::endl; + cols_view_t out_adj ("out_adj", new_entries.extent(0)); + values_view_t out_vals("out_vals", new_entries.extent(0)); + std::cout << 2 << std::endl; + KokkosKernels::Impl::kk_sort_graph + (new_rowmap, new_entries, new_values, out_adj, out_vals); + new_entries = out_adj; + new_values = out_vals; + std::cout << 3 << std::endl; + MyExecSpace().fence(); + 
KokkosKernels::Impl::kk_print_1Dview(out_adj); + KokkosKernels::Impl::kk_print_1Dview(out_vals); + + graph_t transpose_graph(new_entries, new_rowmap); + crstmat_t transpose_matrix("transpose", a_crsmat.numRows(), new_values, transpose_graph); + a_crsmat = transpose_matrix; + + orm = a_crsmat.graph.row_map; + oentries = a_crsmat.graph.entries; + ovalues = a_crsmat.values; + + prm = orm.data(); + pentries = oentries.data(); + pvals = ovalues.data(); + + numrows = a_crsmat.numRows(); + //numcols = a_crsmat.numCols(); + nnz = ovalues.extent(0); + } - KokkosKernels::Impl::write_kokkos_crst_matrix (a_crsmat, out_bin); + KokkosKernels::Impl::write_kokkos_crst_matrix (a_crsmat, out_bin); + } Kokkos::finalize(); diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index bf86768d16..f3a34ba123 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -46,6 +46,8 @@ #define KOKKOSKERNELS_TEST_UTILS_HPP #include "KokkosKernels_Utils.hpp" +#include "Kokkos_ArithTraits.hpp" + namespace Test { template::value> struct multivector_layout_adapter; @@ -83,16 +85,15 @@ namespace Test { template void EXPECT_NEAR_KK(Scalar1 val1, Scalar2 val2, Scalar3 tol) { typedef Kokkos::Details::ArithTraits AT1; - typedef Kokkos::Details::ArithTraits AT2; typedef Kokkos::Details::ArithTraits AT3; - EXPECT_NEAR(double(AT1::abs(val1)),double(AT2::abs(val2)),double(AT3::abs(tol))); + EXPECT_LE((double) AT1::abs(val1 - val2), (double) AT3::abs(tol)); } template void EXPECT_NEAR_KK_1DVIEW(ViewType1 v1, ViewType2 v2, Scalar tol) { size_t v1_size = v1.extent(0); size_t v2_size = v2.extent(0); - EXPECT_NEAR_KK(v1_size, v2_size, 0); + EXPECT_EQ(v1_size, v2_size); typename ViewType1::HostMirror h_v1 = Kokkos::create_mirror_view(v1); @@ -121,6 +122,8 @@ namespace Test { typedef typename ViewTypeA::value_type ScalarA; typedef typename ViewTypeB::value_type ScalarB; typedef typename ViewTypeC::value_type ScalarC; + typedef 
Kokkos::View SubviewTypeA; + typedef Kokkos::View SubviewTypeB; typedef Kokkos::Details::ArithTraits APT; typedef typename APT::mag_type mag_type; ScalarA alpha; @@ -130,11 +133,19 @@ namespace Test { void operator() (const typename Kokkos::TeamPolicy::member_type& team) const { Kokkos::parallel_for(Kokkos::TeamThreadRange(team,C_rows), [&] (const int& i) { // Give each kokkos thread a vector of A - auto a_vec = A_t ? Kokkos::subview(A, Kokkos::ALL(), i) : Kokkos::subview(A, i, Kokkos::ALL()); + SubviewTypeA a_vec; + if(A_t) + a_vec = Kokkos::subview(A, Kokkos::ALL(), i); + else + a_vec = Kokkos::subview(A, i, Kokkos::ALL()); // Have all vector lanes perform the dot product Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,C_cols), [&] (const int& j) { - auto b_vec = B_t ? Kokkos::subview(B, j, Kokkos::ALL()) : Kokkos::subview(B, Kokkos::ALL(), j); + SubviewTypeB b_vec; + if(B_t) + b_vec = Kokkos::subview(B, j, Kokkos::ALL()); + else + b_vec = Kokkos::subview(B, Kokkos::ALL(), j); ScalarC ab = ScalarC(0); for (int k = 0; k < A_cols; k++) { auto a = A_c ? 
APT::conj(a_vec(k)) : a_vec(k); @@ -149,7 +160,7 @@ namespace Test { // C(i,:,:) = alpha * (A(i,:,:) * B(i,:,:)) + beta * C(i,:,:) template struct Functor_BatchedVanillaGEMM { - bool A_t, B_t, A_c, B_c; + bool A_t, B_t, A_c, B_c, batch_size_last_dim = false; ViewTypeA A; ViewTypeB B; ViewTypeC C; @@ -157,25 +168,35 @@ namespace Test { using ScalarA = typename ViewTypeA::value_type; using ScalarB = typename ViewTypeB::value_type; using ScalarC = typename ViewTypeC::value_type; + using SubviewTypeA = typename Kokkos::View; + using SubviewTypeB = typename Kokkos::View; + using SubviewTypeC = typename Kokkos::View; + ScalarA alpha; ScalarC beta; KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { int i = team.league_rank(); + SubviewTypeA _A; + SubviewTypeB _B; + SubviewTypeC _C; - auto _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); - auto _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); - using SubviewTypeA = decltype(_A); - using SubviewTypeB = decltype(_B); - using SubviewTypeC = decltype(_C); + if (batch_size_last_dim) { + _A = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), i); + _B = Kokkos::subview(B, Kokkos::ALL(), Kokkos::ALL(), i); + _C = Kokkos::subview(C, Kokkos::ALL(), Kokkos::ALL(), i); + } else { + _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + } struct SharedVanillaGEMM vgemm; vgemm.A_t = A_t; vgemm.B_t = B_t; vgemm.A_c = A_c; vgemm.B_c = B_c; - vgemm.C_rows = C.extent(1); - vgemm.C_cols = C.extent(2); - vgemm.A_cols = A_t?A.extent(1):A.extent(2); + vgemm.C_rows = batch_size_last_dim ? C.extent(0) : C.extent(1); + vgemm.C_cols = batch_size_last_dim ? C.extent(1) : C.extent(2); + vgemm.A_cols = batch_size_last_dim ? 
(A_t?A.extent(0):A.extent(1)) : (A_t?A.extent(1):A.extent(2)); vgemm.A = _A; vgemm.B = _B; vgemm.C = _C; @@ -188,9 +209,48 @@ namespace Test { void run() { Kokkos::parallel_for( "Test::VanillaGEMM", - Kokkos::TeamPolicy(C.extent(0), Kokkos::AUTO, 16), + Kokkos::TeamPolicy(batch_size_last_dim ? C.extent(2) : C.extent(0), Kokkos::AUTO, 16), *this); } }; + + template + class epsilon { + public: + constexpr static double value = std::numeric_limits::epsilon(); + }; + + // explicit epsilon specializations + #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT + template<> + class epsilon { + public: + constexpr static double value = 0.0009765625F; + }; + #endif // KOKKOS_HALF_T_IS_FLOAT + + //Get the interval for Kokkos::fill_random + //For real, interval is (-mag, mag) + //For complex, both real and imaginary parts will have interval (-mag, mag) + template + inline void getRandomBounds(double mag, Scalar& start, Scalar& end) + { + start = -mag * Kokkos::ArithTraits::one(); + end = mag * Kokkos::ArithTraits::one(); + } + + template<> + inline void getRandomBounds(double mag, Kokkos::complex& start, Kokkos::complex& end) + { + start = Kokkos::complex(-mag, -mag); + end = Kokkos::complex(mag, mag); + } + + template<> + inline void getRandomBounds(double mag, Kokkos::complex& start, Kokkos::complex& end) + { + start = Kokkos::complex(-mag, -mag); + end = Kokkos::complex(mag, mag); + } } #endif diff --git a/test_common/Test_Common_ArithTraits.hpp b/test_common/Test_Common_ArithTraits.hpp index bba54ff6f0..4fab021e66 100644 --- a/test_common/Test_Common_ArithTraits.hpp +++ b/test_common/Test_Common_ArithTraits.hpp @@ -1634,7 +1634,8 @@ int runAllArithTraitsHostTests (std::ostream& out, const int verbose) success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); -#if !defined( KOKKOS_ENABLE_CUDA ) && !defined( KOKKOS_ENABLE_HIP ) +#if 
!defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ + !defined(KOKKOS_ENABLE_SYCL) // This would spill tons of warnings about host device stuff otherwise success = success && curSuccess; curSuccess = testArithTraitsOnHost (out, verbose); success = success && curSuccess; curSuccess = testArithTraitsOnHost, DeviceType> (out, verbose); diff --git a/test_common/Test_Common_Sorting.hpp b/test_common/Test_Common_Sorting.hpp index 3a98c1f420..732ee4b451 100644 --- a/test_common/Test_Common_Sorting.hpp +++ b/test_common/Test_Common_Sorting.hpp @@ -544,7 +544,7 @@ void testBitonicSortLexicographic() } template -void testSortCRS(default_lno_t numRows, default_size_type nnz, bool doValues) +void testSortCRS(default_lno_t numRows, default_lno_t numCols, default_size_type nnz, bool doValues) { using scalar_t = default_scalar; using lno_t = default_lno_t; @@ -559,7 +559,7 @@ void testSortCRS(default_lno_t numRows, default_size_type nnz, bool doValues) //IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this //wouldn't test anything crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix - (numRows, numRows, nnz, 2, numRows / 2); + (numRows, numCols, nnz, 2, numCols / 2); auto rowmap = A.graph.row_map; auto entries = A.graph.entries; auto values = A.values; @@ -774,15 +774,20 @@ TEST_F( TestCategory, common_device_bitonic) { } TEST_F( TestCategory, common_sort_crsgraph) { - testSortCRS(10, 20, false); - testSortCRS(100, 2000, false); - testSortCRS(1000, 30000, false); + testSortCRS(10, 10, 20, false); + testSortCRS(100, 100, 2000, false); + testSortCRS(1000, 1000, 30000, false); } TEST_F( TestCategory, common_sort_crsmatrix) { - testSortCRS(10, 20, true); - testSortCRS(100, 2000, true); - testSortCRS(1000, 30000, true); + testSortCRS(10, 10, 20, true); + testSortCRS(100, 100, 2000, true); + testSortCRS(1000, 1000, 30000, true); +} + +TEST_F( TestCategory, common_sort_crs_longrows) { + testSortCRS(1, 50000, 10000, false); + testSortCRS(1, 
50000, 10000, true); } TEST_F( TestCategory, common_sort_merge_crsmatrix) { diff --git a/unit_test/batched/Test_Batched_SerialEigendecomposition.hpp b/unit_test/batched/Test_Batched_SerialEigendecomposition.hpp index 6ee4ebef18..b63998b75f 100644 --- a/unit_test/batched/Test_Batched_SerialEigendecomposition.hpp +++ b/unit_test/batched/Test_Batched_SerialEigendecomposition.hpp @@ -1,5 +1,6 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) +/* #include "gtest/gtest.h" #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" @@ -111,3 +112,4 @@ int test_batched_serial_eigendecomposition() { return 0; } +*/ diff --git a/unit_test/batched/Test_Batched_SerialEigendecomposition_Real.hpp b/unit_test/batched/Test_Batched_SerialEigendecomposition_Real.hpp index 7108f56bbb..344438e719 100644 --- a/unit_test/batched/Test_Batched_SerialEigendecomposition_Real.hpp +++ b/unit_test/batched/Test_Batched_SerialEigendecomposition_Real.hpp @@ -1,3 +1,4 @@ +/* #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_serial_eigendecomposition_float ) { test_batched_serial_eigendecomposition(); @@ -9,5 +10,5 @@ TEST_F( TestCategory, batched_scalar_serial_eigendecomposition_double ) { test_batched_serial_eigendecomposition(); } #endif - +*/ diff --git a/unit_test/batched/Test_Batched_TeamVectorEigendecomposition.hpp b/unit_test/batched/Test_Batched_TeamVectorEigendecomposition.hpp index 9dd2a6b048..a02c701acd 100644 --- a/unit_test/batched/Test_Batched_TeamVectorEigendecomposition.hpp +++ b/unit_test/batched/Test_Batched_TeamVectorEigendecomposition.hpp @@ -1,5 +1,6 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) +/* #include "gtest/gtest.h" #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" @@ -114,3 +115,4 @@ int test_batched_teamvector_eigendecomposition() { return 0; } +*/ diff --git a/unit_test/batched/Test_Batched_TeamVectorEigendecomposition_Real.hpp b/unit_test/batched/Test_Batched_TeamVectorEigendecomposition_Real.hpp index 14b3c61f4d..b4646c3027 100644 --- 
a/unit_test/batched/Test_Batched_TeamVectorEigendecomposition_Real.hpp +++ b/unit_test/batched/Test_Batched_TeamVectorEigendecomposition_Real.hpp @@ -1,3 +1,4 @@ +/* #if defined(KOKKOSKERNELS_INST_FLOAT) TEST_F( TestCategory, batched_scalar_teamvector_eigendecomposition_float ) { test_batched_teamvector_eigendecomposition(); @@ -9,5 +10,4 @@ TEST_F( TestCategory, batched_scalar_teamvector_eigendecomposition_double ) { test_batched_teamvector_eigendecomposition(); } #endif - - +*/ diff --git a/unit_test/blas/Test_Blas1_abs.hpp b/unit_test/blas/Test_Blas1_abs.hpp index acdb167d1d..d1cb36d368 100644 --- a/unit_test/blas/Test_Blas1_abs.hpp +++ b/unit_test/blas/Test_Blas1_abs.hpp @@ -2,7 +2,6 @@ #include #include #include -#include #include namespace Test { @@ -23,7 +22,7 @@ namespace Test { Kokkos::LayoutRight, Kokkos::LayoutLeft>::type,Device> BaseTypeB; - double eps = std::is_same::value?2*1e-5:1e-7; + typename AT::mag_type eps = AT::epsilon()*10; BaseTypeA b_x("X",N); BaseTypeB b_y("Y",N); @@ -42,29 +41,38 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); - ScalarA expected_result(0); - for(int i=0;i rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + 
Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); - Kokkos::deep_copy(h_b_y,b_y); typename ViewTypeA::const_type c_x = x; - ScalarA* expected_result = new ScalarA[K]; - for(int j=0;j r("Dot::Result",K); + typename AT::mag_type eps = AT::epsilon()*10; + //Test and verify non-const input KokkosBlas::abs(y,x); - KokkosBlas::dot(r,y,y); - for(int k=0;k AT; + typedef Kokkos::ArithTraits MAT; typedef Kokkos::View rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); + Kokkos::deep_copy(h_b_a,b_a); typename ViewTypeA::const_type c_a = a; @@ -36,7 +39,13 @@ namespace Test { typename AT::mag_type expected_result = 0; for(int i=0;i::imag is 0 if T is real. + expected_result += MAT::abs(AT::real(h_a(i))) + MAT::abs(AT::imag(h_a(i))); + } typename AT::mag_type nonconst_result = KokkosBlas::asum(a); EXPECT_NEAR_KK( nonconst_result, expected_result, eps*expected_result); diff --git a/unit_test/blas/Test_Blas1_axpby.hpp b/unit_test/blas/Test_Blas1_axpby.hpp index f2bc692d09..84943b1bc7 100644 --- a/unit_test/blas/Test_Blas1_axpby.hpp +++ b/unit_test/blas/Test_Blas1_axpby.hpp @@ -31,6 +31,7 @@ namespace Test { BaseTypeB b_org_y("Org_Y",N); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); ViewTypeA x = Kokkos::subview(b_x,Kokkos::ALL(),0); ViewTypeB y = Kokkos::subview(b_y,Kokkos::ALL(),0); typename ViewTypeA::const_type c_x = x; @@ -44,26 +45,38 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + 
Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + Kokkos::deep_copy(h_b_org_y, b_org_y); Kokkos::deep_copy(h_b_x,b_x); - Kokkos::deep_copy(h_b_y,b_y); - - ScalarA expected_result = 0; - for(int i=0;i @@ -93,10 +106,19 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); @@ -105,36 +127,32 @@ namespace Test { ScalarB b = 5; typename ViewTypeA::const_type c_x = x; - ScalarA* expected_result = new ScalarA[K]; - for(int j=0;j::value?2*1e-5:1e-7; Kokkos::View r("Dot::Result",K); - typedef Kokkos::Details::ArithTraits AT; - KokkosBlas::axpby(a,x,b,y); - KokkosBlas::dot(r,y,y); - for(int k=0;k::value, Kokkos::LayoutRight, Kokkos::LayoutLeft>::type,Device> BaseTypeB; + using MagnitudeA = typename Kokkos::ArithTraits::mag_type; ScalarA a = 3; - double eps = std::is_same::value?2*1e-5:1e-7; + double eps = std::is_same::value?2e-5:1e-7; BaseTypeA b_x("X",N); BaseTypeB b_y("Y",N); BaseTypeB b_org_y("Org_Y",N); - ViewTypeA x = Kokkos::subview(b_x,Kokkos::ALL(),0); ViewTypeB y = Kokkos::subview(b_y,Kokkos::ALL(),0); @@ -43,26 +43,40 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + 
Kokkos::fill_random(x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); Kokkos::deep_copy(h_b_x,b_x); - Kokkos::deep_copy(h_b_y,b_y); - ScalarA expected_result = 0; + KokkosBlas::axpy(a,x,y); + Kokkos::deep_copy(h_b_y, b_y); + for(int i=0;i @@ -92,10 +106,19 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); @@ -103,33 +126,28 @@ namespace Test { ScalarA a = 3; typename ViewTypeA::const_type c_x = x; - ScalarA* expected_result = new ScalarA[K]; - for(int j=0;j::value?2*1e-5:1e-7; - Kokkos::View r("Dot::Result",K); - KokkosBlas::axpy(a,x,y); - KokkosBlas::dot(r,y,y); - for(int k=0;k rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_b,rand_pool,ScalarB(10)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_b,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(h_b_a,b_a); Kokkos::deep_copy(h_b_b,b_b); @@ -92,10 +98,16 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - 
Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_b,rand_pool,ScalarB(10)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_b,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(h_b_a,b_a); Kokkos::deep_copy(h_b_b,b_b); diff --git a/unit_test/blas/Test_Blas1_iamax.hpp b/unit_test/blas/Test_Blas1_iamax.hpp index 166c25c1a8..5e98912553 100644 --- a/unit_test/blas/Test_Blas1_iamax.hpp +++ b/unit_test/blas/Test_Blas1_iamax.hpp @@ -29,9 +29,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -115,9 +115,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); diff --git a/unit_test/blas/Test_Blas1_mult.hpp b/unit_test/blas/Test_Blas1_mult.hpp index fcab767dcc..1f6856a934 100644 --- a/unit_test/blas/Test_Blas1_mult.hpp +++ b/unit_test/blas/Test_Blas1_mult.hpp @@ -29,7 +29,7 @@ namespace Test { ScalarA a = 3; ScalarB b = 5; - double eps = std::is_same::value?2*1e-5:1e-7; + double eps = std::is_same::value?1e-4:1e-7; BaseTypeA b_x("X",N); BaseTypeB b_y("Y",N); @@ -53,33 +53,52 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); - Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); + { + ScalarA randStart, randEnd; + 
Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarC randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_z,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_z,b_z); + auto h_b_org_z = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); - Kokkos::deep_copy(h_b_z,b_z); - ScalarA expected_result = 0; - for(int i=0;i @@ -118,11 +137,24 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); - Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarC randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_z,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_z,b_z); + auto h_b_org_z = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); @@ -133,33 +165,28 @@ namespace Test { typename ViewTypeA::const_type c_x = x; typename ViewTypeB::const_type c_y = y; - ScalarC* expected_result = new ScalarC[K]; - for(int j=0;j::value?2*1e-5:1e-7; - - Kokkos::View r("Dot::Result",K); + double eps = std::is_same::value?1e-4:1e-7; KokkosBlas::mult(b,z,a,x,y); - KokkosBlas::dot(r,z,z); - for(int k=0;k AT; + typedef Kokkos::ArithTraits AT; + typedef typename AT::mag_type mag_type; + typedef Kokkos::ArithTraits MAT; typedef Kokkos::View 
rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); typename ViewTypeA::const_type c_a = a; - double eps = std::is_same::value?2*1e-5:1e-7; + double eps = (std::is_same::mag_type, float>::value ? 1e-4 : 1e-7); - typename AT::mag_type expected_result = 0; + mag_type expected_result = 0; for(int i=0;i::imag is 0 if T is real. + expected_result += MAT::abs(AT::real(h_a(i))) + MAT::abs(AT::imag(h_a(i))); + } - typename AT::mag_type const_result = KokkosBlas::nrm1(c_a); - EXPECT_NEAR_KK( const_result, expected_result, eps*expected_result); + mag_type nonconst_result = KokkosBlas::nrm1(a); + EXPECT_NEAR_KK( nonconst_result, expected_result, eps * expected_result ); + mag_type const_result = KokkosBlas::nrm1(c_a); + EXPECT_NEAR_KK( const_result, expected_result, eps * expected_result ); } template @@ -53,6 +61,8 @@ namespace Test { typedef typename ViewTypeA::value_type ScalarA; typedef Kokkos::Details::ArithTraits AT; + typedef typename AT::mag_type mag_type; + typedef Kokkos::ArithTraits MAT; typedef multivector_layout_adapter vfA_type; @@ -68,38 +78,36 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); typename ViewTypeA::const_type c_a = a; - typename AT::mag_type* expected_result = new typename AT::mag_type[K]; - for(int j=0;j::mag_type, float>::value ? 
1e-4 : 1e-7); + + Kokkos::View expected_result("Expected Nrm1", K); + for(int k = 0; k < K; k++) + { + expected_result(k) = MAT::zero(); for(int i=0;i::value?2*1e-5:1e-7; - - Kokkos::View r("Dot::Result",K); + Kokkos::View r("Nrm1::Result",K); + Kokkos::View c_r("Nrm1::ConstResult",K); - KokkosBlas::nrm1(r,a); - for(int k=0;k rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(1)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -69,9 +69,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(1)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); diff --git a/unit_test/blas/Test_Blas1_nrm2_squared.hpp b/unit_test/blas/Test_Blas1_nrm2_squared.hpp index ac116b8987..aef2e2e95e 100644 --- a/unit_test/blas/Test_Blas1_nrm2_squared.hpp +++ b/unit_test/blas/Test_Blas1_nrm2_squared.hpp @@ -27,9 +27,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(1)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -68,9 +68,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(1)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); diff --git a/unit_test/blas/Test_Blas1_nrminf.hpp b/unit_test/blas/Test_Blas1_nrminf.hpp index f328a720b7..0893045dee 100644 --- a/unit_test/blas/Test_Blas1_nrminf.hpp +++ b/unit_test/blas/Test_Blas1_nrminf.hpp @@ -27,9 
+27,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -70,9 +70,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -98,13 +98,12 @@ namespace Test { EXPECT_NEAR_KK( nonconst_result, exp_result, eps*exp_result); } - /* KokkosBlas::nrminf(r,c_a); + KokkosBlas::nrminf(r,c_a); for(int k=0;k rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); @@ -99,10 +105,16 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); diff --git a/unit_test/blas/Test_Blas1_scal.hpp b/unit_test/blas/Test_Blas1_scal.hpp index f59b8d49ea..254850f1ae 100644 --- a/unit_test/blas/Test_Blas1_scal.hpp +++ b/unit_test/blas/Test_Blas1_scal.hpp @@ 
-25,13 +25,10 @@ namespace Test { ScalarA a(3); typename AT::mag_type eps = AT::epsilon()*1000; - typename AT::mag_type zero = AT::abs( AT::zero() ); - typename AT::mag_type one = AT::abs( AT::one() ); BaseTypeA b_x("X",N); BaseTypeB b_y("Y",N); BaseTypeB b_org_y("Org_Y",N); - ViewTypeA x = Kokkos::subview(b_x,Kokkos::ALL(),0); ViewTypeB y = Kokkos::subview(b_y,Kokkos::ALL(),0); @@ -46,35 +43,35 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); - ScalarA expected_result(0); - for(int i=0;i rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(1.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } Kokkos::fence(); Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); - Kokkos::deep_copy(h_b_y,b_y); ScalarA a(3.0); typename ViewTypeA::const_type c_x = x; - ScalarA* expected_result = new ScalarA[K]; - for(int j=0;j r("Dot::Result",K); KokkosBlas::scal(y,a,x); - KokkosBlas::dot(r,y,y); - for(int k=0;k params("Params",K); for(int j=0; j rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); 
Kokkos::deep_copy(h_b_a,b_a); @@ -51,7 +51,6 @@ namespace Test { void impl_test_sum_mv(int N, int K) { typedef typename ViewTypeA::value_type ScalarA; - typedef Kokkos::Details::ArithTraits AT; typedef multivector_layout_adapter vfA_type; @@ -67,9 +66,9 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - - Kokkos::fence(); + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_a,rand_pool,randStart,randEnd); Kokkos::deep_copy(h_b_a,b_a); @@ -79,7 +78,7 @@ namespace Test { for(int j=0;j::value?2*1e-5:1e-7; diff --git a/unit_test/blas/Test_Blas1_team_dot.hpp b/unit_test/blas/Test_Blas1_team_dot.hpp index 158dcf5733..f3c819da3b 100644 --- a/unit_test/blas/Test_Blas1_team_dot.hpp +++ b/unit_test/blas/Test_Blas1_team_dot.hpp @@ -46,8 +46,6 @@ namespace Test { Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); Kokkos::fill_random(b_b,rand_pool,ScalarB(10)); - Kokkos::fence(); - Kokkos::deep_copy(h_b_a,b_a); Kokkos::deep_copy(h_b_b,b_b); @@ -150,8 +148,6 @@ namespace Test { Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); Kokkos::fill_random(b_b,rand_pool,ScalarB(10)); - Kokkos::fence(); - Kokkos::deep_copy(h_b_a,b_a); Kokkos::deep_copy(h_b_b,b_b); diff --git a/unit_test/blas/Test_Blas1_team_nrm2.hpp b/unit_test/blas/Test_Blas1_team_nrm2.hpp index 4c654c7eae..99147053ed 100644 --- a/unit_test/blas/Test_Blas1_team_nrm2.hpp +++ b/unit_test/blas/Test_Blas1_team_nrm2.hpp @@ -33,8 +33,6 @@ namespace Test { Kokkos::fill_random(b_a,rand_pool,ScalarA(10)); - Kokkos::fence(); - Kokkos::deep_copy(h_b_a,b_a); typename ViewTypeA::const_type c_a = a; diff --git a/unit_test/blas/Test_Blas1_team_scal.hpp b/unit_test/blas/Test_Blas1_team_scal.hpp index 6b33caa262..fb6ef4487d 100644 --- a/unit_test/blas/Test_Blas1_team_scal.hpp +++ b/unit_test/blas/Test_Blas1_team_scal.hpp @@ -57,8 +57,6 @@ namespace Test { Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); 
Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); @@ -132,8 +130,6 @@ namespace Test { Kokkos::fill_random(b_x,rand_pool,ScalarA(1)); Kokkos::fill_random(b_y,rand_pool,ScalarB(1)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); diff --git a/unit_test/blas/Test_Blas1_team_update.hpp b/unit_test/blas/Test_Blas1_team_update.hpp index dcc9d1e486..5298a6798d 100644 --- a/unit_test/blas/Test_Blas1_team_update.hpp +++ b/unit_test/blas/Test_Blas1_team_update.hpp @@ -66,8 +66,6 @@ namespace Test { Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_z,b_z); Kokkos::deep_copy(h_b_x,b_x); @@ -149,8 +147,6 @@ namespace Test { Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_z,b_z); Kokkos::deep_copy(h_b_x,b_x); diff --git a/unit_test/blas/Test_Blas1_update.hpp b/unit_test/blas/Test_Blas1_update.hpp index 8bfcdbe5cc..0ece3ae74c 100644 --- a/unit_test/blas/Test_Blas1_update.hpp +++ b/unit_test/blas/Test_Blas1_update.hpp @@ -54,35 +54,52 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); - Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarC randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_z,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_z,b_z); + auto h_b_org_z = 
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); + auto h_org_z = Kokkos::subview(h_b_org_z, Kokkos::ALL(), 0); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); Kokkos::deep_copy(h_b_z,b_z); - ScalarA expected_result = 0; - for(int i=0;i @@ -119,13 +136,24 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarA(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarB(10)); - Kokkos::fill_random(b_z,rand_pool,ScalarC(10)); - - Kokkos::fence(); + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarB randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarC randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_z,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_z,b_z); + auto h_b_org_z = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_z); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); @@ -137,33 +165,28 @@ namespace Test { typename ViewTypeA::const_type c_x = x; typename ViewTypeB::const_type c_y = y; - ScalarC* expected_result = new ScalarC[K]; - for(int j=0;j::value?2*1e-5:1e-7; - Kokkos::View r("Dot::Result",K); - KokkosBlas::update(a,x,b,y,c,z); - KokkosBlas::dot(r,z,z); - for(int k=0;k::value ? 2*1e-5 : 1e-7); + double eps = (std::is_same::mag_type, float>::value ? 
1e-3 : 1e-10); int ldx; int ldy; @@ -61,59 +61,80 @@ namespace Test { Kokkos::Random_XorShift64_Pool rand_pool(13718); - Kokkos::fill_random(b_x,rand_pool,ScalarX(10)); - Kokkos::fill_random(b_y,rand_pool,ScalarY(10)); - Kokkos::fill_random(b_A,rand_pool,ScalarA(10)); - - Kokkos::fence(); + { + ScalarX randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_x,rand_pool,randStart,randEnd); + } + { + ScalarY randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_y,rand_pool,randStart,randEnd); + } + { + ScalarA randStart, randEnd; + Test::getRandomBounds(10.0, randStart, randEnd); + Kokkos::fill_random(b_A,rand_pool,randStart,randEnd); + } Kokkos::deep_copy(b_org_y,b_y); + auto h_b_org_y = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), b_org_y); + auto h_org_y = Kokkos::subview(h_b_org_y, Kokkos::ALL(), 0); Kokkos::deep_copy(h_b_x,b_x); Kokkos::deep_copy(h_b_y,b_y); Kokkos::deep_copy(h_b_A,b_A); typedef Kokkos::Details::ArithTraits KAT; - ScalarY expected_result = KAT:: zero(); + Kokkos::View expected("expected aAx+by", ldy); if(mode[0] == 'N') { for(int i = 0; i < M; i++) { - ScalarY y_i = KAT::zero (); + ScalarY y_i = beta * h_org_y(i); for(int j = 0; j < N; j++) { - y_i += h_A(i,j) * h_x(j); + y_i += alpha * h_A(i,j) * h_x(j); } - expected_result += (beta * h_y(i) + alpha * y_i) * (beta * h_y(i) + alpha * y_i) ; + expected(i) = y_i; } } else if(mode[0] == 'T') { for(int j = 0; j < N; j++) { - ScalarY y_j = KAT::zero (); + ScalarY y_j = beta * h_org_y(j); for(int i = 0; i < M; i++) { - y_j += h_A(i,j) * h_x(i); + y_j += alpha * h_A(i,j) * h_x(i); } - expected_result += (beta * h_y(j) + alpha * y_j) * (beta * h_y(j) + alpha * y_j) ; + expected(j) = y_j; } } else if(mode[0] == 'C') { for(int j = 0; j < N; j++) { - ScalarY y_j = KAT::zero (); + ScalarY y_j = beta * h_org_y(j); for(int i = 0; i < M; i++) { - y_j += KAT::conj (h_A(i,j)) * h_x(i); + y_j += alpha * KAT::conj 
(h_A(i,j)) * h_x(i); } - expected_result += (beta * h_y(j) + alpha * y_j) * (beta * h_y(j) + alpha * y_j) ; + expected(j) = y_j; } } KokkosBlas::gemv(mode, alpha, A, x, beta, y); - ScalarY nonconst_nonconst_result = KokkosBlas::dot(y, y); - EXPECT_NEAR_KK( nonconst_nonconst_result, expected_result, eps*expected_result); + Kokkos::deep_copy(h_b_y, b_y); + for(int i = 0; i < ldy; i++) + { + EXPECT_NEAR_KK(expected(i), h_y(i), eps * expected(i)); + } Kokkos::deep_copy(b_y, b_org_y); KokkosBlas::gemv(mode, alpha,A ,c_x, beta, y); - ScalarY const_nonconst_result = KokkosBlas::dot(y, y); - EXPECT_NEAR_KK( const_nonconst_result, expected_result, eps*expected_result); + Kokkos::deep_copy(h_b_y, b_y); + for(int i = 0; i < ldy; i++) + { + EXPECT_NEAR_KK(expected(i), h_y(i), eps); + } Kokkos::deep_copy(b_y, b_org_y); KokkosBlas::gemv(mode, alpha, c_A, c_x, beta, y); - ScalarY const_const_result = KokkosBlas::dot(y, y); - EXPECT_NEAR_KK( const_const_result, expected_result, eps*expected_result); + Kokkos::deep_copy(h_b_y, b_y); + for(int i = 0; i < ldy; i++) + { + EXPECT_NEAR_KK(expected(i), h_y(i), eps); + } } } @@ -203,7 +224,7 @@ TEST_F( TestCategory, gemv_complex_double ) { Kokkos::Profiling::popRegion(); Kokkos::Profiling::pushRegion("KokkosBlas::Test::gemv_conj_complex_double"); - test_gemv,Kokkos::complex,Kokkos::complex,TestExecSpace> ("T"); + test_gemv,Kokkos::complex,Kokkos::complex,TestExecSpace> ("C"); Kokkos::Profiling::popRegion(); } #endif diff --git a/unit_test/blas/Test_Blas2_team_gemv.hpp b/unit_test/blas/Test_Blas2_team_gemv.hpp index 124941bfd8..f8a7f7c1be 100644 --- a/unit_test/blas/Test_Blas2_team_gemv.hpp +++ b/unit_test/blas/Test_Blas2_team_gemv.hpp @@ -64,8 +64,6 @@ namespace Test { Kokkos::fill_random(b_y,rand_pool,ScalarY(10)); Kokkos::fill_random(b_A,rand_pool,ScalarA(10)); - Kokkos::fence(); - Kokkos::deep_copy(b_org_y,b_y); Kokkos::deep_copy(h_b_x,b_x); diff --git a/unit_test/blas/Test_Blas3_gemm.hpp b/unit_test/blas/Test_Blas3_gemm.hpp index 
451b7fedac..580de25397 100644 --- a/unit_test/blas/Test_Blas3_gemm.hpp +++ b/unit_test/blas/Test_Blas3_gemm.hpp @@ -115,8 +115,6 @@ namespace Test { Kokkos::deep_copy(C2,C); - Kokkos::fence(); - struct VanillaGEMM vgemm; vgemm.A_t = A_t; vgemm.B_t = B_t; vgemm.A_c = A_c; vgemm.B_c = B_c; @@ -130,8 +128,6 @@ namespace Test { KokkosBlas::gemm(TA,TB,alpha,A,B,beta,C); - Kokkos::fence(); - mag_type diff_C = 0; struct DiffGEMM diffgemm; diffgemm.N = N; diff --git a/unit_test/blas/Test_Blas3_trmm.hpp b/unit_test/blas/Test_Blas3_trmm.hpp index 9f72bd5e63..4c8d154c15 100644 --- a/unit_test/blas/Test_Blas3_trmm.hpp +++ b/unit_test/blas/Test_Blas3_trmm.hpp @@ -121,7 +121,6 @@ namespace Test { Kokkos::parallel_for("KokkosBlas::Test::NonUnitDiagTRMM", Kokkos::RangePolicy(0,K), nudtrmm); } Kokkos::fill_random(B, rand_pool, Kokkos::rand, ScalarA>::max()); - Kokkos::fence(); Kokkos::deep_copy(host_A, A); // Make host_A a lower triangle @@ -162,11 +161,9 @@ namespace Test { vgemm.beta = beta; Kokkos::parallel_for("KokkosBlas::Test::VanillaGEMM", Kokkos::TeamPolicy(M,Kokkos::AUTO,16), vgemm); } - Kokkos::fence(); Kokkos::deep_copy(host_B_expected, B_expected); KokkosBlas::trmm(side, uplo, trans, diag, alpha, A, B); - Kokkos::fence(); Kokkos::deep_copy(host_B_actual, B); bool test_flag = true; diff --git a/unit_test/blas/Test_Blas3_trsm.hpp b/unit_test/blas/Test_Blas3_trsm.hpp index 8fec44b637..ca9c40ae7e 100644 --- a/unit_test/blas/Test_Blas3_trsm.hpp +++ b/unit_test/blas/Test_Blas3_trsm.hpp @@ -127,8 +127,6 @@ namespace Test { ScalarA alpha_trmm = ScalarA(1)/alpha; ScalarA beta = ScalarA(0); - Kokkos::fence(); - if ((uplo[0]=='L')||(uplo[0]=='l')) { for (int i = 0; i < K-1; i++) for (int j = i+1; j < K; j++) diff --git a/unit_test/graph/Test_Graph_mis2.hpp b/unit_test/graph/Test_Graph_mis2.hpp index 30d32fb2dc..4080a17f80 100644 --- a/unit_test/graph/Test_Graph_mis2.hpp +++ b/unit_test/graph/Test_Graph_mis2.hpp @@ -47,6 +47,7 @@ #include #include "KokkosGraph_MIS2.hpp" +#include 
"KokkosGraph_ExplicitCoarsening.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_IOUtils.hpp" #include "KokkosKernels_SparseUtils.hpp" @@ -194,9 +195,73 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t //Check that every label is in the range [0, numClusters) for(lno_t i = 0; i < numVerts; i++) EXPECT_TRUE(0 <= labelsHost(i) && labelsHost(i) < numClusters); + //Test explicit coarsening given the labels, with and without compressing the result + rowmap_t coarseRowmapNC, coarseRowmapC; + entries_t coarseEntriesNC, coarseEntriesC; + KokkosGraph::Experimental::graph_explicit_coarsen + (symRowmap, symEntries, labels, numClusters, coarseRowmapNC, coarseEntriesNC, false); + KokkosGraph::Experimental::graph_explicit_coarsen + (symRowmap, symEntries, labels, numClusters, coarseRowmapC, coarseEntriesC, true); + EXPECT_EQ(coarseRowmapC.extent(0), numClusters + 1); + EXPECT_EQ(coarseRowmapNC.extent(0), numClusters + 1); + //Check that coarse graph doesn't have more edges than fine graph + EXPECT_LE(coarseEntriesC.extent(0), symEntries.extent(0)); + EXPECT_LE(coarseEntriesNC.extent(0), symEntries.extent(0)); + //Verify compression is working. + auto hostRowmapNC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseRowmapNC); + auto hostEntriesNC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseEntriesNC); + auto hostRowmapC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseRowmapC); + auto hostEntriesC = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), coarseEntriesC); + for(lno_t i = 0; i < numClusters; i++) + { + //std::set maintains uniqueness as well as ascending order of elements. + //So it should exactly match the entries in the compressed version. 
+ std::set uniqueEntries; + for(size_type j = hostRowmapNC(i); j < hostRowmapNC(i + 1); j++) + { + uniqueEntries.insert(hostEntriesNC(j)); + } + size_type compressedRowLen = hostRowmapC(i + 1) - hostRowmapC(i); + ASSERT_EQ(uniqueEntries.size(), compressedRowLen); + auto it = uniqueEntries.begin(); + for(size_type j = hostRowmapC(i); j < hostRowmapC(i + 1); j++) + { + EXPECT_EQ(*it, hostEntriesC(j)); + it++; + } + } } } +template +void test_mis2_coarsening_zero_rows() +{ + using crsMat = KokkosSparse::CrsMatrix; + using graph_type = typename crsMat::StaticCrsGraphType; + using c_rowmap_t = typename graph_type::row_map_type; + using c_entries_t = typename graph_type::entries_type; + using rowmap_t = typename c_rowmap_t::non_const_type; + using entries_t = typename c_entries_t::non_const_type; + rowmap_t fineRowmap; + entries_t fineEntries; + //note: MIS2 coarsening first calls MIS2 on the fine graph, so this covers the zero-row case for MIS2 alone. + lno_t numClusters; + auto labels = graph_mis2_coarsen(fineRowmap, fineEntries, numClusters, KokkosGraph::MIS2_FAST); + EXPECT_EQ(numClusters, 0); + EXPECT_EQ(labels.extent(0), 0); + //coarsen, should also produce a graph with 0 rows/entries + rowmap_t coarseRowmap; + entries_t coarseEntries; + KokkosGraph::Experimental::graph_explicit_coarsen + (fineRowmap, fineEntries, labels, 0, coarseRowmap, coarseEntries, false); + EXPECT_LE(coarseRowmap.extent(0), 1); + EXPECT_EQ(coarseEntries.extent(0), 0); + KokkosGraph::Experimental::graph_explicit_coarsen + (fineRowmap, fineEntries, labels, 0, coarseRowmap, coarseEntries, true); + EXPECT_LE(coarseRowmap.extent(0), 1); + EXPECT_EQ(coarseEntries.extent(0), 0); +} + #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, graph##_##graph_mis2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ { \ @@ -206,9 +271,11 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, lno_t } \ TEST_F(TestCategory, 
graph##_##graph_mis2_coarsening##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) \ { \ + test_mis2_coarsening(5000, 5000 * 200, 2000, 10); \ test_mis2_coarsening(5000, 5000 * 20, 1000, 10); \ test_mis2_coarsening(50, 50 * 10, 40, 10); \ test_mis2_coarsening(5, 5 * 3, 5, 0); \ + test_mis2_coarsening_zero_rows(); \ } #if defined(KOKKOSKERNELS_INST_DOUBLE) diff --git a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp index 13513fef14..85b427d445 100644 --- a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp @@ -189,9 +189,46 @@ testCrsMatrix () //printf ("A is %d by %d\n", A.numRows (), A.numCols ()); } +template +void +testCrsMatrixHostMirror () +{ + using namespace Test; + using crs_matrix = KokkosSparse::CrsMatrix; + using crs_matrix_host = typename crs_matrix::HostMirror; + using crs_graph = typename crs_matrix::StaticCrsGraphType; + using crs_graph_host = typename crs_graph::HostMirror; + crs_matrix A = makeCrsMatrix(); + typename crs_matrix::values_type::HostMirror valuesHost("values host", A.nnz()); + typename crs_matrix::row_map_type::HostMirror rowmapHost("rowmap host", A.numRows() + 1); + typename crs_matrix::index_type::HostMirror entriesHost("entries host", A.nnz()); + crs_graph_host graphHost(entriesHost, rowmapHost); + //Test the two CrsMatrix constructors that take the StaticCrsGraph + crs_matrix_host Ahost1("Ahost1", graphHost); + crs_matrix_host Ahost2("Ahost2", A.numCols(), valuesHost, graphHost); + //Test deep copy constructor (can copy between any two spaces) + { + crs_matrix Bdev("B device", Ahost1); + crs_matrix_host Bhost("B host", A); + } + //Test the empty (0x0, 0 entries) case - zero-length rowmap. 
+ typename crs_graph::row_map_type::non_const_type zeroRowmap; + typename crs_graph::entries_type zeroEntries; + typename crs_matrix::values_type zeroValues; + crs_matrix zero("ZeroRow", 0, 0, 0, zeroValues, zeroRowmap, zeroEntries); + crs_matrix_host zeroHost("zero1Host", zero); + EXPECT_EQ(zeroHost.numRows(), 0); + EXPECT_EQ(zeroHost.numCols(), 0); + EXPECT_EQ(zeroHost.nnz(), 0); + EXPECT_EQ(zeroHost.graph.row_map.extent(0), 0); +} + #define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( TestCategory, sparse ## _ ## crsmatrix ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ testCrsMatrix (); \ +} \ +TEST_F( TestCategory, sparse ## _ ## crsmatrix_host_mirror ## _ ## SCALAR ## _ ## ORDINAL ## _ ## OFFSET ## _ ## DEVICE ) { \ + testCrsMatrixHostMirror (); \ } diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp index 9993d46e22..cbdb673bb1 100644 --- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp @@ -97,6 +97,8 @@ int run_gauss_seidel( KernelHandle; + scalar_t omega(0.9); + KernelHandle kh; kh.set_team_work_size(16); kh.set_dynamic_scheduling(true); @@ -106,6 +108,11 @@ int run_gauss_seidel( // test for two-stage/classical gs kh.create_gs_handle(gs_algorithm); kh.set_gs_twostage(!classic, input_mat.numRows()); + if (classic) { + // two-stage with SpTRSV supports only omega = one + const scalar_t one = Kokkos::Details::ArithTraits::one (); + omega = one; + } } else kh.create_gs_handle(GS_DEFAULT); @@ -120,8 +127,6 @@ int run_gauss_seidel( gauss_seidel_numeric (&kh, num_rows_1, num_cols_1, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, is_symmetric_graph); - scalar_t omega(0.9); - switch (apply_type){ case 0: symmetric_gauss_seidel_apply