diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 72152f749a..6c721e3d54 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -2,12 +2,19 @@ name: github-OSX on: pull_request: - branches: - - master - - develop + types: [ opened, labeled, unlabeled, reopened, synchronize ] jobs: + check-pr-labels: + runs-on: [ubuntu-latest] + steps: + - uses: docker://agilepathway/pull-request-label-checker:latest + with: + none_of: 'AT: WIP' + repo_token: ${{ secrets.GITHUB_TOKEN }} osxci: + needs: check-pr-labels + # TODO: allow re-run via retest label if: ${{ github.event.label.name == 'AT: RETEST' }} name: osx-ci runs-on: [macos-latest] @@ -16,12 +23,16 @@ jobs: include: - backend: "SERIAL" cmake_build_type: "RelWithDebInfo" + debug_bounds_check: "ON" - backend: "THREADS" cmake_build_type: "RelWithDebInfo" + debug_bounds_check: "ON" - backend: "SERIAL" cmake_build_type: "Debug" + debug_bounds_check: "OFF" - backend: "SERIAL" cmake_build_type: "Release" + debug_bounds_check: "ON" steps: - name: checkout_kokkos_kernels @@ -46,6 +57,7 @@ jobs: -DCMAKE_CXX_FLAGS="-Werror" \ -DCMAKE_CXX_STANDARD=14 \ -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEBUG_BOUNDS_CHECK:BOOL=${{ matrix.debug_bounds_check }} \ -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ -DCMAKE_INSTALL_PREFIX=$PWD/../install \ @@ -73,6 +85,8 @@ jobs: -DKokkosKernels_INST_FLOAT=ON \ -DKokkosKernels_INST_LAYOUTLEFT:BOOL=ON \ -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + -DKokkosKernels_INST_OFFSET_SIZE_T=ON \ -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ .. 
@@ -83,4 +97,4 @@ jobs: - name: test working-directory: kokkos-kernels/build - run: ctest -j2 --output-on-failure \ No newline at end of file + run: ctest -j2 --output-on-failure --timeout 3600 \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 97dce4835d..46c4eeaf5f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,119 @@ # Change Log +## [3.7.00](https://github.com/kokkos/kokkos-kernels/tree/3.7.00) (2022-08-18) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.6.01...3.7.00) + +### Features: + +#### Final Bsr algorithms implemented for multigrid: +- Sparse: bsr transpose algorithm [\#1477](https://github.com/kokkos/kokkos-kernels/pull/1477) +- BSR block SpGEMM implementation [\#1099](https://github.com/kokkos/kokkos-kernels/pull/1099) + +#### Adding batched dense linear and non-linear system solvers: +- Add batched GESV [\#1384](https://github.com/kokkos/kokkos-kernels/pull/1384) +- Newton solver: serial on device implementation of Newton's method [\#1479](https://github.com/kokkos/kokkos-kernels/pull/1479) + +#### Add sparse matrix conversion: +- Add csc2csr [\#1342](https://github.com/kokkos/kokkos-kernels/pull/1342) +- csc2csr: update Kokkos_Numeric.hpp header inclusion [\#1449](https://github.com/kokkos/kokkos-kernels/pull/1449) +- sparse: Remove csc2csr copy [\#1375](https://github.com/kokkos/kokkos-kernels/pull/1375) + +#### New documentation in readthedocs +- Added https://kokkos-kernels.readthedocs.io [\#1451](https://github.com/kokkos/kokkos-kernels/pull/1451) +- Restructure docs [\#1368](https://github.com/kokkos/kokkos-kernels/pull/1368) + +#### Fix issues with TPLs for multivector SPMV +- Add cuSparse TPL files for CrsMatrix-multivector product [\#1427](https://github.com/kokkos/kokkos-kernels/pull/1427) + +### Deprecations: +- Add template params to forwarding calls in deprecated KokkosKernels::… [\#1441](https://github.com/kokkos/kokkos-kernels/pull/1441) + +### Implemented enhancements: + +#### +- 
SPILUK: Move host allocations to symbolic [\#1480](https://github.com/kokkos/kokkos-kernels/pull/1480) +- trsv: remove assumptions about entry order within rows [\#1463](https://github.com/kokkos/kokkos-kernels/pull/1463) + +#### Hierarchical BLAS algorithms, added and moved from batched: +- Blas serial axpy and nrm2 [\#1460](https://github.com/kokkos/kokkos-kernels/pull/1460) +- Move Set/Scale unit test to KokkosBlas [\#1455](https://github.com/kokkos/kokkos-kernels/pull/1455) +- Move {Serial,Team,TeamVector} Set to KokkosBlas [\#1454](https://github.com/kokkos/kokkos-kernels/pull/1454) +- Move {Serial,Team,TeamVector}Scale to KokkosBlas [\#1448](https://github.com/kokkos/kokkos-kernels/pull/1448) + +#### Code base organization and clean-ups: +- Common Utils: removing dependency on Sparse Utils in Common Utils [\#1436](https://github.com/kokkos/kokkos-kernels/pull/1436) +- Common cleanup [\#1431](https://github.com/kokkos/kokkos-kernels/pull/1431) +- Clean-up src: re-organizing the src directory [\#1398](https://github.com/kokkos/kokkos-kernels/pull/1398) +- Sparse utils namespace [\#1439](https://github.com/kokkos/kokkos-kernels/pull/1439) + +#### perf tests updates, fixes and clean-ups: +- dot perf test: adding support for HIP and SYCL backend [\#1453](https://github.com/kokkos/kokkos-kernels/pull/1453) +- Add verbosity parameter to GMRES example. Turn off for testing. 
[\#1385](https://github.com/kokkos/kokkos-kernels/pull/1385) +- KokkosSparse_spiluk.cpp perf test: add int-int guards to cusparse codes [\#1369](https://github.com/kokkos/kokkos-kernels/pull/1369) +- perf_test/blas: Check ARMPL build version [\#1352](https://github.com/kokkos/kokkos-kernels/pull/1352) +- Clean-up batched block tridiag perf test [\#1343](https://github.com/kokkos/kokkos-kernels/pull/1343) +- Reduce lots of macro duplication in sparse unit tests [\#1340](https://github.com/kokkos/kokkos-kernels/pull/1340) + +#### Infrastructure changes: ETI and testing upgrades, minor fixes +- sycl: re-enabling test now that dpcpp has made progress [\#1473](https://github.com/kokkos/kokkos-kernels/pull/1473) +- Only instantiate Kokkos's default Cuda mem space [\#1361](https://github.com/kokkos/kokkos-kernels/pull/1361) +- Sparse and CI updates [\#1411](https://github.com/kokkos/kokkos-kernels/pull/1411) +- Newer sparse tests were not following the new testing pattern [\#1356](https://github.com/kokkos/kokkos-kernels/pull/1356) +- Add ETI for D1 coloring [\#1401](https://github.com/kokkos/kokkos-kernels/pull/1401) +- Add ETI to SpAdd (symbolic and numeric) [\#1399](https://github.com/kokkos/kokkos-kernels/pull/1399) +- Reformat example/fenl files changed in 1382 [\#1464](https://github.com/kokkos/kokkos-kernels/pull/1464) +- Change Controls::getParameter error message from stdout to stderr [\#1416](https://github.com/kokkos/kokkos-kernels/pull/1416) + +#### Kokkos alignment: update our implementations to use newer Kokkos features +- Arith traits integral nan [\#1438](https://github.com/kokkos/kokkos-kernels/pull/1438) +- Kokkos_ArithTraits: re-implementation using Kokkos Core [\#1406](https://github.com/kokkos/kokkos-kernels/pull/1406) +- Value-initialize result of MaxLoc reduction to avoid maybe uninitialized warning [\#1383](https://github.com/kokkos/kokkos-kernels/pull/1383) +- Remove volatile qualifiers in reducer join(), init(), and operator+= methods 
[\#1382](https://github.com/kokkos/kokkos-kernels/pull/1382) + +#### BLAS and batched algorithms updates +- Update Batched GMRES [\#1392](https://github.com/kokkos/kokkos-kernels/pull/1392) +- GEMV: accumulate in float for scalar = bhalf_t [\#1360](https://github.com/kokkos/kokkos-kernels/pull/1360) +- Restore BLAS-1 MV paths for 1 column [\#1354](https://github.com/kokkos/kokkos-kernels/pull/1354) + +#### Sparse and Graph updates +- Minor updates to cluster Gauss-Seidel [\#1372](https://github.com/kokkos/kokkos-kernels/pull/1372) +- Add unit test for BsrMatrix and BlockCrsMatrix spmv [\#1338](https://github.com/kokkos/kokkos-kernels/pull/1338) +- Refactor SPGEMM MKL Impl [\#1244](https://github.com/kokkos/kokkos-kernels/pull/1244) +- D1 coloring: remove unused but set variable [\#1403](https://github.com/kokkos/kokkos-kernels/pull/1403) + +#### half precision paper +- Minor changes for half precision paper [\#1429](https://github.com/kokkos/kokkos-kernels/pull/1429) +- Add benchmarks for us-rse escience 2022 half precision paper [\#1422](https://github.com/kokkos/kokkos-kernels/pull/1422) + + +### Bug Fixes: +- TPLs: adding CUBLAS in the list of dependencies [\#1482](https://github.com/kokkos/kokkos-kernels/pull/1482) +- Fix MKL build errors [\#1478](https://github.com/kokkos/kokkos-kernels/pull/1478) +- Fixup drop layout template param in rank-0 views [\#1476](https://github.com/kokkos/kokkos-kernels/pull/1476) +- BLAS: fixing test that access results before synching [\#1472](https://github.com/kokkos/kokkos-kernels/pull/1472) +- Fix D1 color ETI with both CudaSpace and UVM [\#1471](https://github.com/kokkos/kokkos-kernels/pull/1471) +- Fix arithtraits warning [\#1468](https://github.com/kokkos/kokkos-kernels/pull/1468) +- Fix build when double not instantiated [\#1467](https://github.com/kokkos/kokkos-kernels/pull/1467) +- Fix -Werror [\#1466](https://github.com/kokkos/kokkos-kernels/pull/1466) +- Fix GitHub CI failing on broken develop 
[\#1461](https://github.com/kokkos/kokkos-kernels/pull/1461) +- HIP: fix warning from ExecSpaceUtils and GEMV [\#1459](https://github.com/kokkos/kokkos-kernels/pull/1459) +- Removes a duplicate cuda_data_type_from when KOKKOS_HALF_T_IS_FLOAT [\#1456](https://github.com/kokkos/kokkos-kernels/pull/1456) +- Fix incorrect function call in KokkosBatched::TeamGEMV unit test [\#1444](https://github.com/kokkos/kokkos-kernels/pull/1444) +- Fix SYCL nightly test [\#1419](https://github.com/kokkos/kokkos-kernels/pull/1419) +- Fix issues with cuSparse TPL availability for BsrMatrix SpMV [\#1418](https://github.com/kokkos/kokkos-kernels/pull/1418) +- SpMV: fixing issues with unit-tests tolerance [\#1412](https://github.com/kokkos/kokkos-kernels/pull/1412) +- Address 1409 [\#1410](https://github.com/kokkos/kokkos-kernels/pull/1410) +- Fix colliding include guards (copy-paste mistake) [\#1408](https://github.com/kokkos/kokkos-kernels/pull/1408) +- src/sparse: Fix & check for fence post errors [\#1405](https://github.com/kokkos/kokkos-kernels/pull/1405) +- Bspgemm fixes [\#1396](https://github.com/kokkos/kokkos-kernels/pull/1396) +- Fix unused parameter warnings in GEMM test. [\#1381](https://github.com/kokkos/kokkos-kernels/pull/1381) +- Fixes code deprecation warnings. 
[\#1379](https://github.com/kokkos/kokkos-kernels/pull/1379) +- Fix sign-compare warning in SPMV perf test [\#1371](https://github.com/kokkos/kokkos-kernels/pull/1371) +- Minor MKL fixes [\#1365](https://github.com/kokkos/kokkos-kernels/pull/1365) +- perf_test/batched: Temporarily disable tests [\#1359](https://github.com/kokkos/kokkos-kernels/pull/1359) +- Fix nightly builds following promotion of the math functions in Kokkos [\#1339](https://github.com/kokkos/kokkos-kernels/pull/1339) + + ## [3.6.01](https://github.com/kokkos/kokkos-kernels/tree/3.6.01) (2022-05-23) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.6.00...3.6.01) diff --git a/CMakeLists.txt b/CMakeLists.txt index ba5323df27..40d6dd407b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,8 +24,8 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) PROJECT(KokkosKernels CXX) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) - SET(KokkosKernels_VERSION_MINOR 6) - SET(KokkosKernels_VERSION_PATCH 01) + SET(KokkosKernels_VERSION_MINOR 7) + SET(KokkosKernels_VERSION_PATCH 00) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}") ENDIF() @@ -35,7 +35,7 @@ CMAKE_POLICY(SET CMP0074 NEW) INCLUDE(GNUInstallDirs) IF (KOKKOSKERNELS_HAS_TRILINOS) - SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) + SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) SET(KOKKOSKERNELS_HEADER_INSTALL_DIR ${TRILINOS_INCDIR}) SET(KOKKOS_ENABLE_CUDA_UVM ${Kokkos_ENABLE_CUDA_UVM}) ELSEIF(KOKKOSKERNELS_HAS_PARENT) diff --git a/README.md b/README.md index 08f80c19d6..58127b912e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![Generic badge](https://readthedocs.org/projects/pip/badge/?version=latest&style=flat)](https://kokkos-kernels.readthedocs.io/en/latest/) + 
![KokkosKernels](https://avatars2.githubusercontent.com/u/10199860?s=200&v=4) # Kokkos Kernels diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index b26ba7be97..ee195ca0fe 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -230,7 +230,7 @@ display_help_text() { echo "--with-openmptarget: Enable OpenMPTarget backend." echo "--with-sycl: Enable Sycl backend." echo "--with-openmp: Enable OpenMP backend." - echo "--with-pthread: Enable Pthreads backend." + echo "--with-threads: Enable Threads backend." echo "--with-serial: Enable Serial backend." echo "--with-devices: Explicitly add a set of backends." echo "" @@ -274,6 +274,8 @@ display_help_text() { echo " Pascal61 = NVIDIA Pascal generation CC 6.1" echo " Volta70 = NVIDIA Volta generation CC 7.0" echo " Volta72 = NVIDIA Volta generation CC 7.2" + echo " Ampere80 = NVIDIA Ampere generation CC 8.0" + echo " Ampere86 = NVIDIA Ampere generation CC 8.6" echo "" echo "--compiler=/Path/To/Compiler Set the compiler." 
echo "" @@ -335,6 +337,7 @@ display_help_text() { echo "--kokkos-make-j=[NUM]: Set -j parallel level for kokkos install" echo " Default: j == 4" echo "--enable-tests: build Kokkos Kernels unit and performance tests" + echo "--deprecated-code Enable deprecated code (disabled by default)" echo "--enable-perfsuite: build Kokkos Kernels performance tests with RAJAPerf Suite" @@ -360,6 +363,8 @@ KERNELS_DEFAULT_ETI_OPTION="" WITH_CUDA_BACKEND=OFF WITH_HIP_BACKEND=OFF +KOKKOS_DEPRECATED_CODE=OFF + while [[ $# > 0 ]] do key="$1" @@ -415,8 +420,8 @@ do --with-sycl) update_kokkos_devices Sycl ;; - --with-pthread) - update_kokkos_devices Pthread + --with-threads) + update_kokkos_devices Threads ;; --with-serial) update_kokkos_devices Serial @@ -522,6 +527,9 @@ do --disable-examples) KOKKOSKERNELS_DO_EXAMPLES=OFF ;; + --deprecated-code) + KOKKOS_DEPRECATED_CODE=ON + ;; --compiler*) COMPILER="${key#*=}" CNUM=$(command -v ${COMPILER} 2>&1 >/dev/null | grep "no ${COMPILER}" | wc -l) @@ -738,9 +746,9 @@ cd ${KOKKOS_INSTALL_PATH} # Configure kokkos echo "" -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} 
-DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} echo "" -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES}${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF -DKokkos_ENABLE_DEPRECATED_CODE_3=${KOKKOS_DEPRECATED_CODE} ${KOKKOS_PATH} # Install kokkos library make install -j $KOKKOS_MAKEINSTALL_J diff --git a/cmake/Dependencies.cmake
b/cmake/Dependencies.cmake index 2dcedcc1c9..e8b1c6a5e2 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,7 +1,12 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL ROCBLAS ROCSPARSE + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL ROCBLAS ROCSPARSE CUBLAS TEST_OPTIONAL_TPLS yaml-cpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in -# the macro 'KOKKOSKERNELS_ADD_TPL_OPTION' that resides in cmake/kokkoskernels_tpls.cmake. \ No newline at end of file +# the macro 'KOKKOSKERNELS_ADD_TPL_OPTION' that resides in cmake/kokkoskernels_tpls.cmake. + +if (TPL_ENABLE_CUDA) + tribits_tpl_tentatively_enable(CUBLAS) +endif() + diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index f8dd2ae133..1fb6a31544 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -70,6 +70,8 @@ #cmakedefine KOKKOSKERNELS_INST_FLOAT /* Whether to build kernels for scalar type Kokkos::Experimental::half_t */ #cmakedefine KOKKOSKERNELS_INST_HALF +/* Whether to build kernels for scalar type Kokkos::Experimental::bhalf_t */ +#cmakedefine KOKKOSKERNELS_INST_BHALF /* Whether to build kernels for scalar type complex */ #cmakedefine KOKKOSKERNELS_INST_COMPLEX_DOUBLE /* Whether to build kernels for scalar type complex */ diff --git a/cmake/Modules/FindTPLMKL.cmake b/cmake/Modules/FindTPLMKL.cmake index 5766e0f5b0..56f4f34c9e 100644 --- a/cmake/Modules/FindTPLMKL.cmake +++ b/cmake/Modules/FindTPLMKL.cmake @@ -41,6 +41,10 @@ ELSE() LIBRARY_PATHS ${MKL_ROOT}/lib/intel64 ${ENV_LIBDIRS} + HEADER + mkl.h + HEADER_PATHS + ${MKL_ROOT}/include ) ENDIF() ENDIF() diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index 47dce1f9d1..9395cec564 100644 
--- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -41,19 +41,29 @@ SET(MEMSPACE_HBWSPACE_CPP_TYPE Kokkos::HBWSpace) IF(KOKKOS_ENABLE_CUDA) KOKKOSKERNELS_ADD_OPTION( INST_EXECSPACE_CUDA - ${KOKKOSKERNELS_INST_EXECSPACE_CUDA_DEFAULT} + ON BOOL "Whether to pre instantiate kernels for the execution space Kokkos::Cuda. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise." ) + + # By default, instantiate only for Cuda's default memory space (either CudaSpace, or CudaUVMSpace). + IF(KOKKOS_ENABLE_CUDA_UVM) + SET(CUDA_CUDAUVMSPACE_DEFAULT ON) + SET(CUDA_CUDASPACE_DEFAULT OFF) + ELSE() + SET(CUDA_CUDAUVMSPACE_DEFAULT OFF) + SET(CUDA_CUDASPACE_DEFAULT ON) + ENDIF() + KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_CUDAUVMSPACE - ${KOKKOSKERNELS_INST_EXECSPACE_CUDA_DEFAULT} + ${CUDA_CUDAUVMSPACE_DEFAULT} BOOL "Whether to pre instantiate kernels for the memory space Kokkos::CudaUVMSpace. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise." ) KOKKOSKERNELS_ADD_OPTION( INST_MEMSPACE_CUDASPACE - ${KOKKOSKERNELS_INST_EXECSPACE_CUDA_DEFAULT} + ${CUDA_CUDASPACE_DEFAULT} BOOL "Whether to pre instantiate kernels for the memory space Kokkos::CudaSpace. Disabling this when Kokkos_ENABLE_CUDA is enabled may increase build times. Default: ON if Kokkos is CUDA-enabled, OFF otherwise." ) diff --git a/cmake/kokkoskernels_eti_floats.cmake b/cmake/kokkoskernels_eti_floats.cmake index debf99bb0e..3448874336 100644 --- a/cmake/kokkoskernels_eti_floats.cmake +++ b/cmake/kokkoskernels_eti_floats.cmake @@ -25,6 +25,13 @@ KOKKOSKERNELS_ADD_OPTION( "Whether to pre instantiate kernels for the scalar type Kokkos::Experimental::half_t. Disabling this may increase build times. 
Default: OFF" ) +KOKKOSKERNELS_ADD_OPTION( + INST_BHALF + OFF + BOOL + "Whether to pre instantiate kernels for the scalar type Kokkos::Experimental::bhalf_t. Disabling this may increase build times. Default: OFF" +) + SET(FLOATS FLOAT DOUBLE @@ -33,6 +40,7 @@ SET(FLOATS SET(DOUBLE_CPP_TYPE "double") SET(FLOAT_CPP_TYPE "float") SET(HALF_CPP_TYPE "Kokkos::Experimental::half_t") +SET(BHALF_CPP_TYPE "Kokkos::Experimental::bhalf_t") SET(COMPLEX_FLOAT_CPP_TYPE "Kokkos::complex") SET(COMPLEX_DOUBLE_CPP_TYPE "Kokkos::complex") diff --git a/cmake/kokkoskernels_eti_offsets.cmake b/cmake/kokkoskernels_eti_offsets.cmake index 171223010c..484175a976 100644 --- a/cmake/kokkoskernels_eti_offsets.cmake +++ b/cmake/kokkoskernels_eti_offsets.cmake @@ -1,5 +1,5 @@ SET(KOKKOSKERNELS_INST_OFFSET_SIZE_T_DEFAULT ${KOKKOSKERNELS_ADD_DEFAULT_ETI}) -SET(KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT ${KOKKOSKERNELS_ADD_DEFAULT_ETI}) +SET(KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT OFF) SET(OFFSETS OFFSET_INT OFFSET_SIZE_T @@ -12,14 +12,14 @@ KOKKOSKERNELS_ADD_OPTION( INST_OFFSET_INT ${KOKKOSKERNELS_INST_OFFSET_INT_DEFAULT} BOOL - "Whether to pre instantiate kernels for the offset type int. This option is KokkosKernels_INST_OFFSET_INT=ON by default. Default: ON" + "Whether to pre instantiate kernels for the offset type int. This option is KokkosKernels_INST_OFFSET_INT=OFF by default. Default: OFF" ) KOKKOSKERNELS_ADD_OPTION( INST_OFFSET_SIZE_T ${KOKKOSKERNELS_INST_OFFSET_SIZE_T_DEFAULT} BOOL - "Whether to pre instantiate kernels for the offset type size_t. This option is KokkosKernels_INST_OFFSET_SIZE_T=OFF by default. Default: ON" + "Whether to pre instantiate kernels for the offset type size_t. This option is KokkosKernels_INST_OFFSET_SIZE_T=ON by default. 
Default: ON" ) IF (KOKKOSKERNELS_INST_OFFSET_INT) diff --git a/docs/conf.py b/docs/conf.py index efb406329b..59377e4f11 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -18,11 +18,11 @@ # -- Project information ----------------------------------------------------- project = 'Kokkos Kernels' -copyright = '2021, Evan Harvey' -author = 'Evan Harvey' +copyright = '2022, Kokkos Development Team' +author = 'Kokkos Team' # The full version, including alpha/beta/rc tags -release = 'v3.4.1' +release = 'latest' # -- General configuration --------------------------------------------------- diff --git a/docs/developer/apidocs.rst b/docs/developer/apidocs.rst new file mode 100644 index 0000000000..82797c5801 --- /dev/null +++ b/docs/developer/apidocs.rst @@ -0,0 +1,14 @@ +Source Code Documentation +========================= + +The source documentation is extracted from the C++ files using Doxygen. + +.. toctree:: + :maxdepth: 4 + + apidocs/blas1 + apidocs/blas2 + apidocs/blas3 + apidocs/sparse + apidocs/batched_dense + apidocs/batched_sparse \ No newline at end of file diff --git a/docs/developer/apidocs/batched_dense.rst b/docs/developer/apidocs/batched_dense.rst new file mode 100644 index 0000000000..1d65842061 --- /dev/null +++ b/docs/developer/apidocs/batched_dense.rst @@ -0,0 +1,257 @@ +BATCHED -- KokkosKernels batched functor-level interfaces +========================================================= + +innerlu +------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerLU_Internal.hpp + +applypivot +---------- +.. doxygenstruct:: KokkosBatched::TeamVectorApplyPivot + :members: + +qr_withcolumnpivoting +--------------------- +.. doxygenstruct:: KokkosBatched::TeamVectorQR_WithColumnPivoting + :members: + +addradial +--------- +.. doxygenstruct:: KokkosBatched::SerialAddRadial + :members: +.. doxygenstruct:: KokkosBatched::TeamAddRadial + :members: + +householder +----------- +.. doxygenstruct:: KokkosBatched::SerialHouseholder + :members: +.. 
doxygenstruct:: KokkosBatched::TeamVectorHouseholder + :members: + +set +--- +.. doxygenstruct:: KokkosBatched::SerialSet + :members: +.. doxygenstruct:: KokkosBatched::TeamSet + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorSet + :members: + +scale +----- +.. doxygenstruct:: KokkosBatched::SerialScale + :members: +.. doxygenstruct:: KokkosBatched::TeamScale + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorScale + :members: + +setidentity +----------- +.. doxygenstruct:: KokkosBatched::SerialSetIdentity + :members: +.. doxygenstruct:: KokkosBatched::TeamSetIdentity + :members: +.. doxygenstruct:: KokkosBatched::SetIdentity + :members: + +applyhouseholder +---------------- +.. doxygenstruct:: KokkosBatched::SerialApplyHouseholder + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorApplyHouseholder + :members: + +innermultipledotproduct +----------------------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerMultipleDotProduct_Internal.hpp + +lu +-- +.. doxygenstruct:: KokkosBatched::SerialLU + :members: +.. doxygenstruct:: KokkosBatched::TeamLU + :members: +.. doxygenstruct:: KokkosBatched::LU + :members: + +solveutv +-------- +.. doxygenstruct:: KokkosBatched::TeamVectorSolveUTV + :members: + +utv +--- +.. doxygenstruct:: KokkosBatched::TeamVectorUTV + :members: + +inverselu +--------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InverseLU_Internal.hpp + +svd +--- +.. doxygenstruct:: KokkosBatched::SerialSVD + :members: + +eigendecomposition +------------------ +.. doxygenstruct:: KokkosBatched::SerialEigendecomposition + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorEigendecomposition + :members: + +trtri +----- +.. doxygenstruct:: KokkosBatched::SerialTrtri + :members: + +qr +-- +.. doxygenstruct:: KokkosBatched::SerialQR + :members: +.. doxygenstruct:: KokkosBatched::TeamQR + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorQR + :members: +.. 
doxygenstruct:: KokkosBatched::QR + :members: + +trmm +---- +.. doxygenstruct:: KokkosBatched::SerialTrmm + :members: + +trsm +---- +.. doxygenstruct:: KokkosBatched::SerialTrsm + :members: +.. doxygenstruct:: KokkosBatched::TeamTrsm + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorTrsm + :members: +.. doxygenstruct:: KokkosBatched::Trsm + :members: + +innergemmfixa +------------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerGemmFixA_Internal.hpp + +innergemmfixb +------------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerGemmFixB_Internal.hpp + +innergemmfixc +------------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerGemmFixC_Internal.hpp + +applyq +------ +.. doxygenstruct:: KokkosBatched::SerialApplyQ + :members: +.. doxygenstruct:: KokkosBatched::TeamApplyQ + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorApplyQ + :members: +.. doxygenstruct:: KokkosBatched::ApplyQ + :members: + +copy +---- +.. doxygenstruct:: KokkosBatched::SerialCopy + :members: +.. doxygenstruct:: KokkosBatched::TeamCopy + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorCopy + :members: +.. doxygenstruct:: KokkosBatched::Copy + :members: + +innertrsm +--------- +CodeCleanup-TODO: Move Decl file to dense/impl/KokkosBatched_InnerTrsm_Internal.hpp + +solvelu +------- +.. doxygenstruct:: KokkosBatched::SerialSolveLU + :members: +.. doxygenstruct:: KokkosBatched::TeamSolveLU + :members: +.. doxygenstruct:: KokkosBatched::SolveLU + :members: + +xpay +---- +.. doxygenstruct:: KokkosBatched::SerialXpay + :members: +.. doxygenstruct:: KokkosBatched::TeamXpay + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorXpay + :members: + +axpy +---- +.. doxygenstruct:: KokkosBatched::SerialAxpy + :members: +.. doxygenstruct:: KokkosBatched::TeamAxpy + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorAxpy + :members: + +gemv +---- +.. doxygenstruct:: KokkosBatched::SerialGemv + :members: +.. 
doxygenstruct:: KokkosBatched::TeamGemv + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorGemv + :members: +.. doxygenstruct:: KokkosBatched::Gemv + :members: + +dot +--- +.. doxygenstruct:: KokkosBatched::SerialDot + :members: +.. doxygenstruct:: KokkosBatched::TeamDot + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorDot + :members: + +hadamardproduct +--------------- +.. doxygenstruct:: KokkosBatched::SerialHadamardProduct + :members: +.. doxygenstruct:: KokkosBatched::TeamHadamardProduct + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorHadamardProduct + :members: +.. doxygenstruct:: KokkosBatched::HadamardProduct + :members: + +vector +------ +CodeCleanup-TODO: Move Decl file to dense/impl/ + +trsv +---- +.. doxygenstruct:: KokkosBatched::SerialTrsv + :members: +.. doxygenstruct:: KokkosBatched::TeamTrsv + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorTrsv + :members: +.. doxygenstruct:: KokkosBatched::Trsv + :members: + +gemm +---- +.. doxygenstruct:: KokkosBatched::SerialGemm + :members: +.. doxygenstruct:: KokkosBatched::TeamGemm + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorGemm + :members: +.. doxygenstruct:: KokkosBatched::Gemm + :members: \ No newline at end of file diff --git a/docs/developer/apidocs/batched_sparse.rst b/docs/developer/apidocs/batched_sparse.rst new file mode 100644 index 0000000000..48031bc550 --- /dev/null +++ b/docs/developer/apidocs/batched_sparse.rst @@ -0,0 +1,43 @@ +SPARSE BATCHED -- KokkosKernels sparse batched functor-level interfaces +======================================================================= + +cg +-- +.. doxygenstruct:: KokkosBatched::CG + :members: + +crsmatrix +--------- +.. doxygenclass:: KokkosBatched::CrsMatrix + :members: + +gmres +----- +.. doxygenstruct:: KokkosBatched::GMRES + :members: + +identity +-------- +.. doxygenclass:: KokkosBatched::Identity + :members: + +jacobiprec +---------- +.. 
doxygenclass:: KokkosBatched::JacobiPrec + :members: + +krylovhandle +------------ +.. doxygenclass:: KokkosBatched::KrylovHandle + :members: + +spmv +---- +.. doxygenstruct:: KokkosBatched::SerialSpmv + :members: +.. doxygenstruct:: KokkosBatched::TeamSpmv + :members: +.. doxygenstruct:: KokkosBatched::TeamVectorSpmv + :members: +.. doxygenstruct:: KokkosBatched::Spmv + :members: \ No newline at end of file diff --git a/docs/developer/apidocs/blas1.rst b/docs/developer/apidocs/blas1.rst new file mode 100644 index 0000000000..bfeb7fd1bb --- /dev/null +++ b/docs/developer/apidocs/blas1.rst @@ -0,0 +1,55 @@ +BLAS1 -- KokkosKernels blas1 interfaces +======================================= + +axpby +----- +.. doxygenfunction:: KokkosBlas::axpby + +dot +--- +.. doxygenfunction:: KokkosBlas::dot(const RV &, const XMV &, const YMV &, typename std::enable_if<Kokkos::is_view<RV>::value, int>::type = 0) +.. doxygenfunction:: KokkosBlas::dot(const XVector &, const YVector &) + +fill +---- +.. doxygenfunction:: KokkosBlas::fill + +mult +---- +.. doxygenfunction:: KokkosBlas::mult + +nrm1 +---- +.. doxygenfunction:: KokkosBlas::nrm1(const RV &, const XMV &, typename std::enable_if<Kokkos::is_view<RV>::value, int>::type = 0) +.. doxygenfunction:: KokkosBlas::nrm1(const XVector &) + +nrm2 +---- +.. doxygenfunction:: KokkosBlas::nrm2(const RV &R, const XMV &X, typename std::enable_if<Kokkos::is_view<RV>::value, int>::type = 0) +.. doxygenfunction:: KokkosBlas::nrm2(const XVector &x) + +nrm2w +----- +.. doxygenfunction:: KokkosBlas::nrm2w(const RV &R, const XMV &X, const XMV &W, typename std::enable_if<Kokkos::is_view<RV>::value, int>::type = 0) +.. doxygenfunction:: KokkosBlas::nrm2w(const XVector &x, const XVector &w) + +nrminf +------ +.. doxygenfunction:: KokkosBlas::nrminf(const RV &R, const XMV &X, typename std::enable_if<Kokkos::is_view<RV>::value, int>::type = 0) +.. doxygenfunction:: KokkosBlas::nrminf(const XVector &x) + +reciprocal +---------- +.. doxygenfunction:: KokkosBlas::reciprocal + +scal +---- +.. doxygenfunction:: KokkosBlas::scal + +sum +--- +.. 
doxygenfunction:: KokkosBlas::sum(const RV &R, const XMV &X, typename std::enable_if::value, int>::type = 0) + +update +------ +.. doxygenfunction:: KokkosBlas::update diff --git a/docs/developer/apidocs/blas2.rst b/docs/developer/apidocs/blas2.rst new file mode 100644 index 0000000000..1d9a3f3fa7 --- /dev/null +++ b/docs/developer/apidocs/blas2.rst @@ -0,0 +1,7 @@ +BLAS2 -- KokkosKernels blas2 interfaces +======================================= + +gemv +---- +.. doxygenfunction:: KokkosBlas::gemv(const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) +.. doxygenfunction:: KokkosBlas::gemv(const typename AViewType::execution_space &space, const char trans[], typename AViewType::const_value_type &alpha, const AViewType &A, const XViewType &x, typename YViewType::const_value_type &beta, const YViewType &y) diff --git a/docs/developer/apidocs/blas3.rst b/docs/developer/apidocs/blas3.rst new file mode 100644 index 0000000000..810b28a5a3 --- /dev/null +++ b/docs/developer/apidocs/blas3.rst @@ -0,0 +1,8 @@ +BLAS3 -- KokkosKernels blas3 interfaces +======================================= + +gemm +---- +.. doxygenfunction:: KokkosBlas::gemm(const char transA, const char transB, AMat::const_value_type alpha, const AMat &a, const BMat &b, CMat::const_value_type beta, const CMat &c) +.. doxygenfunction:: KokkosBlas::gemm(const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C) +.. 
doxygenfunction:: KokkosBlas::gemm(const typename CViewType::execution_space &space, const char transA[], const char transB[], typename AViewType::const_value_type &alpha, const AViewType &A, const BViewType &B, typename CViewType::const_value_type &beta, const CViewType &C) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst new file mode 100644 index 0000000000..84ec48a519 --- /dev/null +++ b/docs/developer/apidocs/sparse.rst @@ -0,0 +1,27 @@ +SPARSE -- KokkosKernels sparse interfaces +========================================= + +crsmatrix +--------- +.. doxygenclass:: KokkosSparse::CrsMatrix + :members: + +spmv +---- +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls, const char[], const AlphaType&, const AMatrix&, const XVector&, const BetaType&, const YVector&) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_ONE) +.. doxygenfunction:: KokkosSparse::spmv(KokkosKernels::Experimental::Controls, const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y, const RANK_TWO) +.. doxygenfunction:: KokkosSparse::spmv(const char mode[], const AlphaType &alpha, const AMatrix &A, const XVector &x, const BetaType &beta, const YVector &y) + +trsv +---- +.. doxygenfunction:: KokkosSparse::trsv + +spgemm +------ +.. doxygenfunction:: KokkosSparse::spgemm + +gauss +----- +.. 
doxygenfunction:: KokkosSparse::gauss diff --git a/docs/developer/build_doc.rst b/docs/developer/build_doc.rst new file mode 100644 index 0000000000..dd3d357286 --- /dev/null +++ b/docs/developer/build_doc.rst @@ -0,0 +1,18 @@ +Building Developer Documentation +================================ + +.. code-block:: + :caption: Installing dependencies on MacOS + + brew install doxygen + pip install sphinx + pip install breathe + pip install sphinx-rtd-theme + +.. code-block:: + :caption: How to build developer documentation + + cmake -DKokkosKernels_ENABLE_DOCS:BOOL=ON /path/to/kokkos-kernels + make Doxygen + make Sphinx + open build/docs/docs/sphinx/index.html \ No newline at end of file diff --git a/docs/developer/contrib.rst b/docs/developer/contrib.rst new file mode 100644 index 0000000000..0b02ebf190 --- /dev/null +++ b/docs/developer/contrib.rst @@ -0,0 +1,46 @@ +Contributing +============ + +Comment Style +------------- +We follow doxygen style comments for both external (API) and internal members. See https://www.doxygen.nl/manual/docblocks.html for details. +Our documentation can be generated using the `-DKokkosKernels_ENABLE_DOCS:BOOL=ON` cmake flag; see `Building the Documentation`. + +In general, we prefer that the prototype has the doxygen style comment rather than the definition. If there is no prototype, then the definition should have the doxygen style comment. + +.. code-block:: + :caption: API Doxygen Style Example + + /// \brief Blocking wrapper for accessing a Kokkos View. + /// \tparam ViewValueType The value type (Scalar or Vector) of each view element + /// \tparam ViewType The view type + /// \param v The view handle + /// \param m The requested row index of v + /// \param n The requested col index of v + /// \return If m and n are within the extents of v, a valid element of v; + /// otherwise, the last element of v. 
+ /// + template + KOKKOS_INLINE_FUNCTION ViewValueType + access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &); + +Library policies +---------------- + +System-specific functions +------------------------- +For portability, any system-specific function that is not in the C++ standard should not be invoked from kokkos-kernels. + +Upcasting and downcasting +------------------------- +TODO + +Blocking and non-blocking interfaces +------------------------------------ +All the APIs are non-blocking unless: +1. A TPL is enabled +2. The result vector resides on the host and work is offloaded to a device + +When a TPL is enabled, we follow the blocking semantics of the TPL interface. + +If no TPLs are enabled, callers can avoid blocking calls by using any overload which accepts a result vector type as a template argument. \ No newline at end of file diff --git a/docs/developer/index.rst b/docs/developer/index.rst new file mode 100644 index 0000000000..7ee05f98ae --- /dev/null +++ b/docs/developer/index.rst @@ -0,0 +1,10 @@ +Developer Manual +================ + +.. toctree:: + :maxdepth: 1 + + Source Code Documentation + Building the Documentation + Code Style Guide + Contributing \ No newline at end of file diff --git a/docs/developer/style.rst b/docs/developer/style.rst new file mode 100644 index 0000000000..ddd9ce5197 --- /dev/null +++ b/docs/developer/style.rst @@ -0,0 +1,34 @@ +Style Guide +=========== + +We follow Google's C++ coding style. See https://google.github.io/styleguide/cppguide.html and https://github.com/kokkos/kokkos-kernels/blob/master/.clang-format for details. + +.. code-block:: + :caption: Automate coding style via a pre-commit hook + + cat kokkos-kernels/.git/hooks/pre-commit + for FILE in $(git diff --cached --name-only | egrep '.*\.cpp$|.*\.hpp$|.*\.h$') + do + if [ -e $FILE ]; then + clang-format-8 -i -style=file $FILE + git add $FILE + fi + done + chmod +x kokkos-kernels/.git/hooks/pre-commit + +.. 
code-block:: + :caption: Conditionally enable or disable formatting + + // clang-format off + cpp code here + // clang-format on + +.. code-block:: + :caption: Install clang-format on MacOS + + brew install clang-format-8 + +.. code-block:: + :caption: Install clang-format on Ubuntu + + apt install clang-format-8 \ No newline at end of file diff --git a/docs/developer/write_developer_doc.rst b/docs/developer/write_developer_doc.rst new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/developer/write_user_doc.rst b/docs/developer/write_user_doc.rst new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/index.rst b/docs/index.rst index 06240595bf..db873e9a3b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,33 +1,8 @@ -.. Kokkos Kernels documentation master file, created by - sphinx-quickstart on Fri Sep 24 13:19:45 2021. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to Kokkos Kernels's documentation! +Kokkos Kernels documentation: Under Construction ========================================== - .. toctree:: :maxdepth: 2 - :caption: Contents: - - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` - -Docs -==== -.. doxygennamespace:: KokkosBlas - :project: KokkosKernels - :members: -.. doxygennamespace:: KokkosSparse - :project: KokkosKernels - :members: -.. 
doxygennamespace:: KokkosBatched - :project: KokkosKernels - :members: \ No newline at end of file + KokkosKernels GitHub Homepage + User Manual + Developer Docs diff --git a/doc/kokkos-promotion.txt b/docs/kokkos-promotion.txt similarity index 100% rename from doc/kokkos-promotion.txt rename to docs/kokkos-promotion.txt diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000000..188f51e62d --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1 @@ +breathe \ No newline at end of file diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index a0c8c1f564..45fb3a41e1 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -7,3 +7,5 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../test_common) #ADD_SUBDIRECTORY(graph) ADD_SUBDIRECTORY(wiki) ADD_SUBDIRECTORY(gmres) +ADD_SUBDIRECTORY(batched_solve) +ADD_SUBDIRECTORY(half) diff --git a/example/batched_solve/CMakeLists.txt b/example/batched_solve/CMakeLists.txt new file mode 100644 index 0000000000..2e3ce96523 --- /dev/null +++ b/example/batched_solve/CMakeLists.txt @@ -0,0 +1,12 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +KOKKOSKERNELS_ADD_EXECUTABLE( + static_pivoting + SOURCES static_pivoting.cpp + ) + +KOKKOSKERNELS_ADD_EXECUTABLE( + team_GMRES + SOURCES team_GMRES.cpp + ) diff --git a/example/batched_solve/examples_helper.hpp b/example/batched_solve/examples_helper.hpp new file mode 100644 index 0000000000..41b936a35c --- /dev/null +++ b/example/batched_solve/examples_helper.hpp @@ -0,0 +1,236 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER + +/// \brief create_saddle_point_matrices: +/// +/// This function creates the matrices and the rhs of a batched saddle point +/// system where A and Y (the right hand side) are as follows: +/// +/// ___________ +/// | | T | +/// | B | C | +/// A = |-----+-----| +/// | C | 0 | +/// |_____|_____| +/// +/// _____ +/// | | +/// | D | +/// Y = |-----| +/// | 0 | +/// |_____| +/// +/// with A in R^{n \times n}, B in R^{(n-n_2) \times (n-n_2)} and +/// where B and C are computed as follows: +/// +/// 1. A sequence of n-n_2 points of R^{n_dim} is generated randomly: +/// x^(0), ..., x^(n-n_2-1) +/// 2. Given this sequence, the entries are computed as follows: +/// B_{(i,j)} = \| x^(i) - x^(j)\|^5 +/// C_{(0,j)} = 1 +/// C_{(i,j)} = (x^(j))_{(i-1)} for i != 0 +/// +/// 3. D is generated randomly. +/// +/// This function uses a different sequence of x and a different D for every +/// system within the batched system. +/// +/// As a consequence of its definition, every diagonal entry of A is +/// zero. +/// +/// \tparam MatrixViewType: type of the batched matrices +/// \tparam VectorViewType: type of the batched vectors +/// +/// \param A [in/out]: a rank 3 view that has to be preallocated that will store +/// the entries of the batched matrix. \param Y [in/out]: a rank 2 view that has +/// to be preallocated that will store the entries of the right hand side. \param +/// n_dim [in]: the dimension of the physical space where the points are +/// randomly generated (default = 3). 
+/// + +template +void create_saddle_point_matrices(const MatrixViewType &A, + const VectorViewType &Y, + const int n_dim = 3) { + Kokkos::Random_XorShift64_Pool< + typename MatrixViewType::device_type::execution_space> + random(13718); + const int N = A.extent(0); + const int n = A.extent(1); + const int n_2 = n_dim + 1; + const int n_1 = n - n_2; + + MatrixViewType xs("xs", N, n_1, n_dim); + VectorViewType ys("ys", N, n_1); + + Kokkos::fill_random( + xs, random, + Kokkos::reduction_identity::prod()); + Kokkos::fill_random( + ys, random, + Kokkos::reduction_identity::prod()); + + auto xs_host = Kokkos::create_mirror_view(xs); + auto ys_host = Kokkos::create_mirror_view(ys); + auto A_host = Kokkos::create_mirror_view(A); + auto Y_host = Kokkos::create_mirror_view(Y); + + Kokkos::deep_copy(xs_host, xs); + Kokkos::deep_copy(ys_host, ys); + + for (int i = 0; i < n_1; ++i) { + for (int j = 0; j < n_1; ++j) { + for (int l = 0; l < N; ++l) { + auto xs_i = Kokkos::subview(xs_host, l, i, Kokkos::ALL); + auto xs_j = Kokkos::subview(xs_host, l, j, Kokkos::ALL); + typename MatrixViewType::value_type d = 0; + for (int k = 0; k < n_dim; ++k) d += Kokkos::pow(xs_i(k) - xs_j(k), 2); + d = Kokkos::sqrt(d); + A_host(l, i, j) = Kokkos::pow(d, 5); + } + } + for (int l = 0; l < N; ++l) { + A_host(l, i, n_1) = (typename MatrixViewType::value_type)1.0; + A_host(l, n_1, i) = (typename MatrixViewType::value_type)1.0; + for (int k = 0; k < n_dim; ++k) { + A_host(l, i, n_1 + k + 1) = xs_host(l, i, k); + A_host(l, n_1 + k + 1, i) = xs_host(l, i, k); + } + Y_host(l, i) = ys_host(l, i); + } + } + for (int i = n_1; i < n; ++i) { + for (int l = 0; l < N; ++l) { + Y_host(l, i) = (typename MatrixViewType::value_type)0.0; + } + } + + Kokkos::deep_copy(A, A_host); + Kokkos::deep_copy(Y, Y_host); + + Kokkos::fence(); +} + +template +void create_tridiagonal_batched_matrices(const int nnz, const int BlkSize, + const int N, const IntView &r, + const IntView &c, + const VectorViewType &D, + const 
VectorViewType &X, + const VectorViewType &B) { + Kokkos::Random_XorShift64_Pool< + typename VectorViewType::device_type::execution_space> + random(13718); + Kokkos::fill_random( + X, random, + Kokkos::reduction_identity::prod()); + Kokkos::fill_random( + B, random, + Kokkos::reduction_identity::prod()); + + auto D_host = Kokkos::create_mirror_view(D); + auto r_host = Kokkos::create_mirror_view(r); + auto c_host = Kokkos::create_mirror_view(c); + + r_host(0) = 0; + + int current_col = 0; + + for (int i = 0; i < BlkSize; ++i) { + r_host(i + 1) = r_host(i) + (i == 0 || i == (BlkSize - 1) ? 2 : 3); + } + for (int i = 0; i < nnz; ++i) { + if (i % 3 == 0) { + for (int l = 0; l < N; ++l) { + D_host(l, i) = typename VectorViewType::value_type(2.0); + } + c_host(i) = current_col; + ++current_col; + } else { + for (int l = 0; l < N; ++l) { + D_host(l, i) = typename VectorViewType::value_type(-1.0); + } + c_host(i) = current_col; + if (i % 3 == 1) + --current_col; + else + ++current_col; + } + } + + Kokkos::fence(); + + Kokkos::deep_copy(D, D_host); + Kokkos::deep_copy(r, r_host); + Kokkos::deep_copy(c, c_host); + + Kokkos::fence(); +} + +template +void getInvDiagFromCRS(const VType &V, const IntType &r, const IntType &c, + const VType &diag) { + auto diag_values_host = Kokkos::create_mirror_view(diag); + auto values_host = Kokkos::create_mirror_view(V); + auto row_ptr_host = Kokkos::create_mirror_view(r); + auto colIndices_host = Kokkos::create_mirror_view(c); + + Kokkos::deep_copy(values_host, V); + Kokkos::deep_copy(row_ptr_host, r); + Kokkos::deep_copy(colIndices_host, c); + + int current_index; + int N = diag.extent(0); + int BlkSize = diag.extent(1); + + for (int i = 0; i < BlkSize; ++i) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); + ++current_index) { + if (colIndices_host(current_index) == i) break; + } + for (int j = 0; j < N; ++j) { + diag_values_host(j, i) = 1. 
/ values_host(j, current_index); + } + } + + Kokkos::deep_copy(diag, diag_values_host); +} \ No newline at end of file diff --git a/example/batched_solve/static_pivoting.cpp b/example/batched_solve/static_pivoting.cpp new file mode 100644 index 0000000000..69ab25b62f --- /dev/null +++ b/example/batched_solve/static_pivoting.cpp @@ -0,0 +1,182 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#include + +#define KOKKOSKERNELS_DEBUG_LEVEL 0 + +#include "Kokkos_Core.hpp" +#include "Kokkos_Timer.hpp" +#include "Kokkos_Random.hpp" +#include "Kokkos_UnorderedMap.hpp" +#include "Kokkos_Sort.hpp" + +/// KokkosKernels headers +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" +#include "KokkosKernels_IOUtils.hpp" + +#include +#include +#include "examples_helper.hpp" +#include +#include +#include +#include +#include +#include +#include "KokkosBatched_Gesv.hpp" + +typedef Kokkos::DefaultExecutionSpace exec_space; + +template +struct Functor_TeamTestStaticPivoting { + const AViewType _A; + const XYViewType _X; + const XYViewType _Y; + + KOKKOS_INLINE_FUNCTION + Functor_TeamTestStaticPivoting(const AViewType &A, const XYViewType &X, + const XYViewType &Y) + : _A(A), _X(X), _Y(Y) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { + const int matrix_id = static_cast(member.league_rank()); + + auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto X = Kokkos::subview(_X, matrix_id, Kokkos::ALL); + auto Y = Kokkos::subview(_Y, matrix_id, Kokkos::ALL); + member.team_barrier(); + KokkosBatched::TeamGesv::invoke(member, + A, X, + Y); + member.team_barrier(); + } + + inline void run() { + std::string 
name("KokkosBatched::Test::StaticPivoting"); + Kokkos::TeamPolicy policy(_A.extent(0), Kokkos::AUTO(), + Kokkos::AUTO()); + + using MatrixViewType = + Kokkos::View; + + const int n = _A.extent(1); + size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4); + + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0)); + + Kokkos::parallel_for(name.c_str(), policy, *this); + } +}; + +template +struct Functor_SerialTestStaticPivoting { + const AViewType _A; + const AViewType _tmp; + const XYViewType _X; + const XYViewType _Y; + + KOKKOS_INLINE_FUNCTION + Functor_SerialTestStaticPivoting(const AViewType &A, const AViewType &tmp, + const XYViewType &X, const XYViewType &Y) + : _A(A), _tmp(tmp), _X(X), _Y(Y) {} + + KOKKOS_INLINE_FUNCTION void operator()(const int &matrix_id) const { + auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto tmp = Kokkos::subview(_tmp, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto X = Kokkos::subview(_X, matrix_id, Kokkos::ALL); + auto Y = Kokkos::subview(_Y, matrix_id, Kokkos::ALL); + KokkosBatched::SerialGesv::invoke( + A, X, Y, tmp); + } + + inline void run() { + std::string name("KokkosBatched::Test::StaticPivoting"); + + const int N = _A.extent(0); + Kokkos::parallel_for(name.c_str(), N, *this); + } +}; + +int main(int /*argc*/, char ** /*argv[]*/) { + Kokkos::initialize(); + { + using layout = Kokkos::LayoutLeft; + + using AViewType = Kokkos::View; + using XYViewType = Kokkos::View; + + int N = 1; + int n = 10; + + AViewType A("A", N, n, n); + AViewType tmp("tmp", N, n, n + 4); + XYViewType X("X", N, n); + XYViewType Y("Y", N, n); + + create_saddle_point_matrices(A, Y); + + // The matrices are modified by the GESV so we have to copy them if we want + // to solve the same systems twice. 
+ AViewType A2("A2", N, n, n); + XYViewType Y2("Y2", N, n); + Kokkos::deep_copy(A2, A); + Kokkos::deep_copy(Y2, Y); + + KokkosKernels::Impl::kk_write_3Dview_to_file(A, "A.txt"); + KokkosKernels::Impl::kk_write_2Dview_to_file(Y, "Y.txt"); + + Functor_SerialTestStaticPivoting(A, tmp, + X, Y) + .run(); + KokkosKernels::Impl::kk_write_2Dview_to_file(X, "X_serial.txt"); + Functor_TeamTestStaticPivoting(A2, X, Y2) + .run(); + KokkosKernels::Impl::kk_write_2Dview_to_file(X, "X_team.txt"); + } + Kokkos::finalize(); +} diff --git a/example/batched_solve/team_GMRES.cpp b/example/batched_solve/team_GMRES.cpp new file mode 100644 index 0000000000..404e573491 --- /dev/null +++ b/example/batched_solve/team_GMRES.cpp @@ -0,0 +1,328 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#include + +#define KOKKOSKERNELS_DEBUG_LEVEL 0 + +#include "Kokkos_Core.hpp" +#include "Kokkos_Timer.hpp" +#include "Kokkos_Random.hpp" +#include "Kokkos_UnorderedMap.hpp" +#include "Kokkos_Sort.hpp" + +/// KokkosKernels headers +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" +#include "KokkosKernels_IOUtils.hpp" + +#include +#include +#include "examples_helper.hpp" +#include +#include +#include +#include +#include + +typedef Kokkos::DefaultExecutionSpace exec_space; + +template +struct Functor_TestBatchedTeamVectorGMRES { + const ValuesViewType _values; + const ValuesViewType _diag; + const IntView _r; + const IntView _c; + const VectorViewType _X; + const VectorViewType _B; + const int _team_size, _vector_length; + KrylovHandleType _handle; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamVectorGMRES( + const ValuesViewType &values, const IntView &r, const IntView &c, + const VectorViewType &X, const VectorViewType &B, const int team_size, + const int vector_length, KrylovHandleType &handle) + : _values(values), + _r(r), + _c(c), + 
_X(X), + _B(B), + _team_size(team_size), + _vector_length(vector_length), + _handle(handle) {} + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamVectorGMRES( + const ValuesViewType &values, const ValuesViewType &diag, + const IntView &r, const IntView &c, const VectorViewType &X, + const VectorViewType &B, const int team_size, const int vector_length, + KrylovHandleType &handle) + : _values(values), + _diag(diag), + _r(r), + _c(c), + _X(X), + _B(B), + _team_size(team_size), + _vector_length(vector_length), + _handle(handle) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { + const int first_matrix = _handle.first_index(member.league_rank()); + const int last_matrix = _handle.last_index(member.league_rank()); + using TeamVectorCopy1D = + KokkosBatched::TeamVectorCopy; + + auto d = Kokkos::subview( + _values, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + + using ScratchPadIntViewType = + Kokkos::View; + using ScratchPadValuesViewType = Kokkos::View< + typename ValuesViewType::non_const_value_type **, + typename ValuesViewType::array_layout, + typename ValuesViewType::execution_space::scratch_memory_space>; + + using Operator = + KokkosBatched::CrsMatrix; + + ScratchPadIntViewType tmp_1D_int(member.team_scratch(0), + _r.extent(0) + _c.extent(0)); + + auto r = + Kokkos::subview(tmp_1D_int, Kokkos::make_pair(0, (int)_r.extent(0))); + auto c = Kokkos::subview( + tmp_1D_int, + Kokkos::make_pair((int)_r.extent(0), (int)tmp_1D_int.extent(0))); + + TeamVectorCopy1D::invoke(member, _r, r); + TeamVectorCopy1D::invoke(member, _c, c); + Operator A(d, r, c); + + if (UsePrec) { + ScratchPadValuesViewType diag( + member.team_scratch(0), last_matrix - first_matrix, _diag.extent(1)); + using PrecOperator = KokkosBatched::JacobiPrec; + + 
KokkosBatched::TeamVectorCopy::invoke( + member, + Kokkos::subview(_diag, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL), + diag); + PrecOperator P(diag); + P.setComputedInverse(); + + KokkosBatched::TeamVectorGMRES::template invoke< + Operator, VectorViewType, PrecOperator, KrylovHandleType>( + member, A, b, x, P, _handle); + } else { + KokkosBatched::TeamVectorGMRES::template invoke< + Operator, VectorViewType>(member, A, b, x, _handle); + } + } + + inline double run() { + std::string name("KokkosBatched::Test::TeamVectorGMRES"); + Kokkos::Timer timer; + Kokkos::Profiling::pushRegion(name.c_str()); + + Kokkos::TeamPolicy auto_policy(_handle.get_number_of_teams(), + Kokkos::AUTO(), Kokkos::AUTO()); + Kokkos::TeamPolicy tuned_policy(_handle.get_number_of_teams(), + _team_size, _vector_length); + Kokkos::TeamPolicy policy; + + if (_team_size < 1) + policy = auto_policy; + else + policy = tuned_policy; + + int maximum_iteration = _handle.get_max_iteration(); + + using ScalarType = typename ValuesViewType::non_const_value_type; + using Layout = typename ValuesViewType::array_layout; + using EXSP = typename ValuesViewType::execution_space; + + using ViewType2D = Kokkos::View; + + size_t bytes_1D = + ViewType2D::shmem_size(_handle.get_number_of_systems_per_team(), 1); + size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0)); + size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); + size_t bytes_2D_1 = ViewType2D::shmem_size( + _handle.get_number_of_systems_per_team(), _X.extent(1)); + size_t bytes_2D_2 = ViewType2D::shmem_size( + _handle.get_number_of_systems_per_team(), maximum_iteration + 1); + + size_t bytes_int = bytes_row_ptr + bytes_col_idc; + size_t bytes_diag = bytes_2D_1; + size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; + + policy.set_scratch_size( + 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); + + exec_space().fence(); + timer.reset(); + Kokkos::parallel_for(name.c_str(), policy, *this); + exec_space().fence(); + 
double sec = timer.seconds(); + + return sec; + } +}; + +int main(int /*argc*/, char ** /*argv*/) { + Kokkos::initialize(); + { + using layout = Kokkos::LayoutLeft; + + using IntView = Kokkos::View; + using AMatrixValueView = Kokkos::View; + using XYType = Kokkos::View; + + std::string name_A = "mat.mm"; + std::string name_B = "rhs.mm"; + + int N, Blk, nnz; + + Blk = 10; + N = 100; + nnz = (Blk - 2) * 3 + 2 * 2; + + IntView rowOffsets("rowOffsets", Blk + 1); + IntView colIndices("colIndices", nnz); + AMatrixValueView values("values", N, nnz); + AMatrixValueView diag("diag", N, Blk); + XYType x("x", N, Blk); + XYType y("y", N, Blk); + + printf("N = %d, Blk = %d, nnz = %d\n", N, Blk, nnz); + + create_tridiagonal_batched_matrices(nnz, Blk, N, rowOffsets, colIndices, + values, x, y); + + // Replace y by ones: + Kokkos::deep_copy(y, 1.); + + // Replace x by zeros: + // Kokkos::deep_copy(x, 0.); + + getInvDiagFromCRS(values, rowOffsets, colIndices, diag); + + using ScalarType = typename AMatrixValueView::non_const_value_type; + using Layout = typename AMatrixValueView::array_layout; + using EXSP = typename AMatrixValueView::execution_space; + + using MagnitudeType = + typename Kokkos::Details::ArithTraits::mag_type; + + using Norm2DViewType = Kokkos::View; + using Scalar3DViewType = Kokkos::View; + using IntViewType = Kokkos::View; + + using KrylovHandleType = + KokkosBatched::KrylovHandle; + + const int N_team = 2; + const int n_iterations = 150; + + const int team_size = -1; + const int vector_length = -1; + const double tol = 1e-8; + const int ortho_strategy = 0; + + KrylovHandleType handle(N, N_team, n_iterations, true); + handle.Arnoldi_view = + Scalar3DViewType("", N, n_iterations, Blk + n_iterations + 3); + + handle.set_max_iteration(n_iterations); + handle.set_tolerance(tol); + handle.set_ortho_strategy(ortho_strategy); + handle.set_scratch_pad_level(0); + handle.set_compute_last_residual(true); + + double time = + Functor_TestBatchedTeamVectorGMRES(values, diag, 
rowOffsets, + colIndices, x, y, team_size, + vector_length, handle) + .run(); + + printf("times = %f secondes\n", time); + + for (int i = 0; i < N; ++i) { + if (handle.is_converged_host(i)) { + std::cout + << "System " << i << " converged in " + << handle.get_iteration_host(i) + << " iterations, the initial absolute norm of the residual was " + << handle.get_norm_host(i, 0) << " and is now " + << handle.get_last_norm_host(i) << std::endl; + } else { + std::cout + << "System " << i << " did not converge in " + << handle.get_max_iteration() + << " iterations, the initial absolute norm of the residual was " + << handle.get_norm_host(i, 0) << " and is now " + << handle.get_last_norm_host(i) << std::endl; + } + } + if (handle.is_converged_host()) + std::cout << "All the systems have converged." << std::endl; + else + std::cout << "There is at least one system that did not converge." + << std::endl; + } + Kokkos::finalize(); +} diff --git a/example/fenl/TestFixture.hpp b/example/fenl/TestFixture.hpp index 165265b881..54b841c4b6 100644 --- a/example/fenl/TestFixture.hpp +++ b/example/fenl/TestFixture.hpp @@ -56,102 +56,101 @@ namespace Kokkos { namespace Example { -template< class Device > -struct FixtureVerifyElemNodeCoord -{ - typedef Device execution_space ; +template +struct FixtureVerifyElemNodeCoord { + typedef Device execution_space; - typedef struct { size_t success , error ; } value_type ; + typedef struct { + size_t success, error; + } value_type; - typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ; + typedef Kokkos::Example::BoxElemFixture< + Device, Kokkos::Example::BoxElemPart::ElemLinear> + FixtureType; - FixtureType m_fixture ; + FixtureType m_fixture; KOKKOS_INLINE_FUNCTION - void init( value_type & update ) const { update.success = update.error = 0 ; } + void init(value_type& update) const { update.success = update.error = 0; } KOKKOS_INLINE_FUNCTION - void join( volatile value_type & update , - 
volatile const value_type & input ) const - { - update.success += input.success ; - update.error += input.error ; - } - + void join(value_type& update, const value_type& input) const { + update.success += input.success; + update.error += input.error; + } KOKKOS_INLINE_FUNCTION - void operator()( size_t ielem , value_type & update ) const - { - unsigned node_coord[ FixtureType::ElemNode ][3] ; - - for ( unsigned i = 0 ; i < FixtureType::ElemNode ; ++i ) { - const unsigned node_id = m_fixture.elem_node(ielem,i); - node_coord[i][0] = m_fixture.node_grid(node_id,0); - node_coord[i][1] = m_fixture.node_grid(node_id,1); - node_coord[i][2] = m_fixture.node_grid(node_id,2); + void operator()(size_t ielem, value_type& update) const { + unsigned node_coord[FixtureType::ElemNode][3]; + + for (unsigned i = 0; i < FixtureType::ElemNode; ++i) { + const unsigned node_id = m_fixture.elem_node(ielem, i); + node_coord[i][0] = m_fixture.node_grid(node_id, 0); + node_coord[i][1] = m_fixture.node_grid(node_id, 1); + node_coord[i][2] = m_fixture.node_grid(node_id, 2); } - int error = 0 ; - for ( unsigned i = 1 ; i < FixtureType::ElemNode ; ++i ) { - if ( node_coord[0][0] + m_fixture.elem_node_local(i,0) != node_coord[i][0] || - node_coord[0][1] + m_fixture.elem_node_local(i,1) != node_coord[i][1] || - node_coord[0][2] + m_fixture.elem_node_local(i,2) != node_coord[i][2] ) { - error = 1 ; + int error = 0; + for (unsigned i = 1; i < FixtureType::ElemNode; ++i) { + if (node_coord[0][0] + m_fixture.elem_node_local(i, 0) != + node_coord[i][0] || + node_coord[0][1] + m_fixture.elem_node_local(i, 1) != + node_coord[i][1] || + node_coord[0][2] + m_fixture.elem_node_local(i, 2) != + node_coord[i][2]) { + error = 1; } } - if ( error ) { - ++update.error ; - } - else { - ++update.success ; + if (error) { + ++update.error; + } else { + ++update.success; } } - FixtureVerifyElemNodeCoord( const FixtureType & f ) : m_fixture(f) {} + FixtureVerifyElemNodeCoord(const FixtureType& f) : m_fixture(f) {} }; 
+template +void test_fixture() { + typedef Kokkos::Example::BoxElemFixture< + Device, Kokkos::Example::BoxElemPart::ElemLinear> + FixtureType; -template< class Device > -void test_fixture() -{ - typedef Kokkos::Example::BoxElemFixture< Device , Kokkos::Example::BoxElemPart::ElemLinear > FixtureType ; - - const Kokkos::Example::BoxElemPart::Decompose - decompose = Kokkos::Example::BoxElemPart:: DecomposeElem ; // DecomposeElem | DecomposeNode ; - - const unsigned global_size = 256 ; - const unsigned global_nx = 400 ; - const unsigned global_ny = 400 ; - const unsigned global_nz = 400 ; + const Kokkos::Example::BoxElemPart::Decompose decompose = + Kokkos::Example::BoxElemPart::DecomposeElem; // DecomposeElem | + // DecomposeNode ; - for ( unsigned my_rank = 0 ; my_rank < global_size ; ++my_rank ) { + const unsigned global_size = 256; + const unsigned global_nx = 400; + const unsigned global_ny = 400; + const unsigned global_nz = 400; - const FixtureType fixture( decompose , global_size , my_rank , global_nx , global_ny , global_nz ); + for (unsigned my_rank = 0; my_rank < global_size; ++my_rank) { + const FixtureType fixture(decompose, global_size, my_rank, global_nx, + global_ny, global_nz); // Verify grid coordinates of element's nodes - - typename FixtureVerifyElemNodeCoord::value_type result = { 0 , 0 }; - Kokkos::parallel_reduce( fixture.elem_node().extent(0) , FixtureVerifyElemNodeCoord( fixture ) , result ); + typename FixtureVerifyElemNodeCoord::value_type result = {0, 0}; - if ( result.error ) { + Kokkos::parallel_reduce(fixture.elem_node().extent(0), + FixtureVerifyElemNodeCoord(fixture), + result); + + if (result.error) { std::cout << "P[" << my_rank << ":" << global_size << "] Fixture elem_node_coord" << " success(" << result.success << ")" - << " error(" << result.error << ")" - << std::endl ; + << " error(" << result.error << ")" << std::endl; } // Check send/recv alignment - - } } - } /* namespace Example */ } /* namespace Kokkos */ #endif /* #ifndef 
KOKKOS_EXAMPLE_TESTFIXTURE_HPP */ - diff --git a/example/fenl/fenl_functors.hpp b/example/fenl/fenl_functors.hpp index 01a4e989da..0a489fa1c0 100644 --- a/example/fenl/fenl_functors.hpp +++ b/example/fenl/fenl_functors.hpp @@ -69,44 +69,42 @@ namespace Kokkos { namespace Example { namespace FENL { -template< class ElemNodeIdView , class CrsGraphType , unsigned ElemNode > +template class NodeNodeGraph { -public: + public: + typedef typename ElemNodeIdView::execution_space execution_space; + typedef pair key_type; - typedef typename ElemNodeIdView::execution_space execution_space ; - typedef pair key_type ; - - typedef Kokkos::UnorderedMap< key_type, void , execution_space > SetType ; - typedef typename CrsGraphType::row_map_type::non_const_type RowMapType ; - typedef Kokkos::View< unsigned , execution_space > UnsignedValue ; + typedef Kokkos::UnorderedMap SetType; + typedef typename CrsGraphType::row_map_type::non_const_type RowMapType; + typedef Kokkos::View UnsignedValue; // Static dimensions of 0 generate compiler warnings or errors. 
- typedef Kokkos::View< unsigned*[ElemNode][ElemNode] , execution_space > - ElemGraphType ; - -private: - - enum PhaseType { FILL_NODE_SET , - SCAN_NODE_COUNT , - FILL_GRAPH_ENTRIES , - SORT_GRAPH_ENTRIES , - FILL_ELEMENT_GRAPH }; - - const unsigned node_count ; - const ElemNodeIdView elem_node_id ; - UnsignedValue row_total ; - RowMapType row_count ; - RowMapType row_map ; - SetType node_node_set ; - PhaseType phase ; + typedef Kokkos::View + ElemGraphType; + + private: + enum PhaseType { + FILL_NODE_SET, + SCAN_NODE_COUNT, + FILL_GRAPH_ENTRIES, + SORT_GRAPH_ENTRIES, + FILL_ELEMENT_GRAPH + }; -public: + const unsigned node_count; + const ElemNodeIdView elem_node_id; + UnsignedValue row_total; + RowMapType row_count; + RowMapType row_map; + SetType node_node_set; + PhaseType phase; - CrsGraphType graph ; - ElemGraphType elem_graph ; + public: + CrsGraphType graph; + ElemGraphType elem_graph; - struct Times - { + struct Times { double ratio; double fill_node_set; double scan_node_count; @@ -115,139 +113,146 @@ class NodeNodeGraph { double fill_element_graph; }; - NodeNodeGraph( const ElemNodeIdView & arg_elem_node_id , - const unsigned arg_node_count, - Times & results - ) - : node_count(arg_node_count) - , elem_node_id( arg_elem_node_id ) - , row_total( "row_total" ) - , row_count(Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_count") , node_count ) // will deep_copy to 0 inside loop - , row_map( "graph_row_map" , node_count + 1 ) - , node_node_set() - , phase( FILL_NODE_SET ) - , graph() - , elem_graph() - { - //-------------------------------- - // Guess at span required for the map: - - Kokkos::Timer wall_clock ; - - wall_clock.reset(); - phase = FILL_NODE_SET ; - - // upper bound on the span - size_t set_span = (28ull * node_count) / 2; - - { - // Zero the row count to restart the fill - Kokkos::deep_copy( row_count , 0u ); - - node_node_set = SetType( set_span ); - - // May be larger that requested: - set_span = node_node_set.span(); - - 
Kokkos::parallel_for( "kokkos-kernels/example/fenl: NodeNodeGraph" , elem_node_id.extent(0) , *this ); - } + NodeNodeGraph(const ElemNodeIdView& arg_elem_node_id, + const unsigned arg_node_count, Times& results) + : node_count(arg_node_count), + elem_node_id(arg_elem_node_id), + row_total("row_total"), + row_count(Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_count"), + node_count) // will deep_copy to 0 inside loop + , + row_map("graph_row_map", node_count + 1), + node_node_set(), + phase(FILL_NODE_SET), + graph(), + elem_graph() { + //-------------------------------- + // Guess at span required for the map: + + Kokkos::Timer wall_clock; + + wall_clock.reset(); + phase = FILL_NODE_SET; + + // upper bound on the span + size_t set_span = (28ull * node_count) / 2; - execution_space().fence(); - results.ratio = (double)node_node_set.size() / (double)node_node_set.span(); - results.fill_node_set = wall_clock.seconds(); - //-------------------------------- + { + // Zero the row count to restart the fill + Kokkos::deep_copy(row_count, 0u); - wall_clock.reset(); - phase = SCAN_NODE_COUNT ; + node_node_set = SetType(set_span); - // Exclusive scan of row_count into row_map - // including the final total in the 'node_count + 1' position. - // Zero the 'row_count' values. 
- Kokkos::parallel_scan( node_count , *this ); + // May be larger that requested: + set_span = node_node_set.span(); - // Zero the row count for the fill: - Kokkos::deep_copy( row_count , 0u ); + Kokkos::parallel_for("kokkos-kernels/example/fenl: NodeNodeGraph", + elem_node_id.extent(0), *this); + } - unsigned graph_entry_count = 0 ; + execution_space().fence(); + results.ratio = (double)node_node_set.size() / (double)node_node_set.span(); + results.fill_node_set = wall_clock.seconds(); + //-------------------------------- - Kokkos::deep_copy( graph_entry_count , row_total ); + wall_clock.reset(); + phase = SCAN_NODE_COUNT; - // Assign graph's row_map and allocate graph's entries - graph.row_map = row_map ; - graph.entries = typename CrsGraphType::entries_type( "graph_entries" , graph_entry_count ); + // Exclusive scan of row_count into row_map + // including the final total in the 'node_count + 1' position. + // Zero the 'row_count' values. + Kokkos::parallel_scan(node_count, *this); - //-------------------------------- - // Fill graph's entries from the (node,node) set. + // Zero the row count for the fill: + Kokkos::deep_copy(row_count, 0u); - execution_space().fence(); - results.scan_node_count = wall_clock.seconds(); + unsigned graph_entry_count = 0; - wall_clock.reset(); - phase = FILL_GRAPH_ENTRIES ; - Kokkos::parallel_for( node_node_set.span() , *this ); + Kokkos::deep_copy(graph_entry_count, row_total); - execution_space().fence(); - results.fill_graph_entries = wall_clock.seconds(); + // Assign graph's row_map and allocate graph's entries + graph.row_map = row_map; + graph.entries = + typename CrsGraphType::entries_type("graph_entries", graph_entry_count); - //-------------------------------- - // Done with the temporary sets and arrays - wall_clock.reset(); - phase = SORT_GRAPH_ENTRIES ; + //-------------------------------- + // Fill graph's entries from the (node,node) set. 
- row_total = UnsignedValue(); - row_count = RowMapType(); - row_map = RowMapType(); - node_node_set.clear(); + execution_space().fence(); + results.scan_node_count = wall_clock.seconds(); - //-------------------------------- + wall_clock.reset(); + phase = FILL_GRAPH_ENTRIES; + Kokkos::parallel_for(node_node_set.span(), *this); - Kokkos::parallel_for( node_count , *this ); + execution_space().fence(); + results.fill_graph_entries = wall_clock.seconds(); - execution_space().fence(); - results.sort_graph_entries = wall_clock.seconds(); + //-------------------------------- + // Done with the temporary sets and arrays + wall_clock.reset(); + phase = SORT_GRAPH_ENTRIES; - //-------------------------------- - // Element-to-graph mapping: - wall_clock.reset(); - phase = FILL_ELEMENT_GRAPH ; - elem_graph = ElemGraphType("elem_graph", elem_node_id.extent(0) ); - Kokkos::parallel_for( elem_node_id.extent(0) , *this ); + row_total = UnsignedValue(); + row_count = RowMapType(); + row_map = RowMapType(); + node_node_set.clear(); - execution_space().fence(); - results.fill_element_graph = wall_clock.seconds(); - } + //-------------------------------- + + Kokkos::parallel_for(node_count, *this); + + execution_space().fence(); + results.sort_graph_entries = wall_clock.seconds(); + + //-------------------------------- + // Element-to-graph mapping: + wall_clock.reset(); + phase = FILL_ELEMENT_GRAPH; + elem_graph = ElemGraphType("elem_graph", elem_node_id.extent(0)); + Kokkos::parallel_for(elem_node_id.extent(0), *this); + + execution_space().fence(); + results.fill_element_graph = wall_clock.seconds(); + } //------------------------------------ // parallel_for: create map and count row length KOKKOS_INLINE_FUNCTION - void fill_set( const unsigned ielem ) const - { + void fill_set(const unsigned ielem) const { // Loop over element's (row_local_node,col_local_node) pairs: - for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) { - - const 
unsigned row_node = elem_node_id( ielem , row_local_node ); + for (unsigned row_local_node = 0; row_local_node < elem_node_id.extent(1); + ++row_local_node) { + const unsigned row_node = elem_node_id(ielem, row_local_node); - for ( unsigned col_local_node = row_local_node ; col_local_node < elem_node_id.extent(1) ; ++col_local_node ) { + for (unsigned col_local_node = row_local_node; + col_local_node < elem_node_id.extent(1); ++col_local_node) { + const unsigned col_node = elem_node_id(ielem, col_local_node); - const unsigned col_node = elem_node_id( ielem , col_local_node ); + // If either node is locally owned then insert the pair into the + // unordered map: - // If either node is locally owned then insert the pair into the unordered map: + if (row_node < row_count.extent(0) || col_node < row_count.extent(0)) { + const key_type key = (row_node < col_node) + ? make_pair(row_node, col_node) + : make_pair(col_node, row_node); - if ( row_node < row_count.extent(0) || col_node < row_count.extent(0) ) { - - const key_type key = (row_node < col_node) ? 
make_pair( row_node, col_node ) : make_pair( col_node, row_node ) ; - - const typename SetType::insert_result result = node_node_set.insert( key ); + const typename SetType::insert_result result = + node_node_set.insert(key); // A successfull insert: the first time this pair was added - if ( result.success() ) { - + if (result.success()) { // If row node is owned then increment count - if ( row_node < row_count.extent(0) ) { atomic_fetch_add( & row_count( row_node ) , 1 ); } + if (row_node < row_count.extent(0)) { + atomic_fetch_add(&row_count(row_node), 1); + } - // If column node is owned and not equal to row node then increment count - if ( col_node < row_count.extent(0) && col_node != row_node ) { atomic_fetch_add( & row_count( col_node ) , 1 ); } + // If column node is owned and not equal to row node then increment + // count + if (col_node < row_count.extent(0) && col_node != row_node) { + atomic_fetch_add(&row_count(col_node), 1); + } } } } @@ -255,114 +260,111 @@ class NodeNodeGraph { } KOKKOS_INLINE_FUNCTION - void fill_graph_entries( const unsigned iset ) const - { - if ( node_node_set.valid_at(iset) ) { + void fill_graph_entries(const unsigned iset) const { + if (node_node_set.valid_at(iset)) { // Add each entry to the graph entries. 
- const key_type key = node_node_set.key_at(iset) ; - const unsigned row_node = key.first ; - const unsigned col_node = key.second ; + const key_type key = node_node_set.key_at(iset); + const unsigned row_node = key.first; + const unsigned col_node = key.second; - if ( row_node < row_count.extent(0) ) { - const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 ); - graph.entries( offset ) = col_node ; + if (row_node < row_count.extent(0)) { + const unsigned offset = + graph.row_map(row_node) + atomic_fetch_add(&row_count(row_node), 1); + graph.entries(offset) = col_node; } - if ( col_node < row_count.extent(0) && col_node != row_node ) { - const unsigned offset = graph.row_map( col_node ) + atomic_fetch_add( & row_count( col_node ) , 1 ); - graph.entries( offset ) = row_node ; + if (col_node < row_count.extent(0) && col_node != row_node) { + const unsigned offset = + graph.row_map(col_node) + atomic_fetch_add(&row_count(col_node), 1); + graph.entries(offset) = row_node; } } } KOKKOS_INLINE_FUNCTION - void sort_graph_entries( const unsigned irow ) const - { - const unsigned row_beg = graph.row_map( irow ); - const unsigned row_end = graph.row_map( irow + 1 ); - for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) { + void sort_graph_entries(const unsigned irow) const { + const unsigned row_beg = graph.row_map(irow); + const unsigned row_end = graph.row_map(irow + 1); + for (unsigned i = row_beg + 1; i < row_end; ++i) { const unsigned col = graph.entries(i); - unsigned j = i ; - for ( ; row_beg < j && col < graph.entries(j-1) ; --j ) { - graph.entries(j) = graph.entries(j-1); + unsigned j = i; + for (; row_beg < j && col < graph.entries(j - 1); --j) { + graph.entries(j) = graph.entries(j - 1); } - graph.entries(j) = col ; + graph.entries(j) = col; } } KOKKOS_INLINE_FUNCTION - void fill_elem_graph_map( const unsigned ielem ) const - { - for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; 
++row_local_node ) { - - const unsigned row_node = elem_node_id( ielem , row_local_node ); - - for ( unsigned col_local_node = 0 ; col_local_node < elem_node_id.extent(1) ; ++col_local_node ) { - - const unsigned col_node = elem_node_id( ielem , col_local_node ); + void fill_elem_graph_map(const unsigned ielem) const { + for (unsigned row_local_node = 0; row_local_node < elem_node_id.extent(1); + ++row_local_node) { + const unsigned row_node = elem_node_id(ielem, row_local_node); - unsigned entry = ~0u ; + for (unsigned col_local_node = 0; col_local_node < elem_node_id.extent(1); + ++col_local_node) { + const unsigned col_node = elem_node_id(ielem, col_local_node); - if ( row_node + 1 < graph.row_map.extent(0) ) { + unsigned entry = ~0u; - const unsigned entry_end = graph.row_map( row_node + 1 ); + if (row_node + 1 < graph.row_map.extent(0)) { + const unsigned entry_end = graph.row_map(row_node + 1); - entry = graph.row_map( row_node ); + entry = graph.row_map(row_node); - for ( ; entry < entry_end && graph.entries(entry) != col_node ; ++entry ); + for (; entry < entry_end && graph.entries(entry) != col_node; ++entry) + ; - if ( entry == entry_end ) entry = ~0u ; + if (entry == entry_end) entry = ~0u; } - elem_graph( ielem , row_local_node , col_local_node ) = entry ; + elem_graph(ielem, row_local_node, col_local_node) = entry; } } } KOKKOS_INLINE_FUNCTION - void operator()( const unsigned iwork ) const - { - if ( phase == FILL_NODE_SET ) { - fill_set( iwork ); - } - else if ( phase == FILL_GRAPH_ENTRIES ) { - fill_graph_entries( iwork ); - } - else if ( phase == SORT_GRAPH_ENTRIES ) { - sort_graph_entries( iwork ); - } - else if ( phase == FILL_ELEMENT_GRAPH ) { - fill_elem_graph_map( iwork ); + void operator()(const unsigned iwork) const { + if (phase == FILL_NODE_SET) { + fill_set(iwork); + } else if (phase == FILL_GRAPH_ENTRIES) { + fill_graph_entries(iwork); + } else if (phase == SORT_GRAPH_ENTRIES) { + sort_graph_entries(iwork); + } else if (phase == 
FILL_ELEMENT_GRAPH) { + fill_elem_graph_map(iwork); } } //------------------------------------ // parallel_scan: row offsets - typedef unsigned value_type ; + typedef unsigned value_type; KOKKOS_INLINE_FUNCTION - void operator()( const unsigned irow , unsigned & update , const bool final ) const - { + void operator()(const unsigned irow, unsigned& update, + const bool final) const { // exclusive scan - if ( final ) { row_map( irow ) = update ; } + if (final) { + row_map(irow) = update; + } - update += row_count( irow ); + update += row_count(irow); - if ( final ) { - if ( irow + 1 == row_count.extent(0) ) { - row_map( irow + 1 ) = update ; - row_total() = update ; + if (final) { + if (irow + 1 == row_count.extent(0)) { + row_map(irow + 1) = update; + row_total() = update; } } } KOKKOS_INLINE_FUNCTION - void init( unsigned & update ) const { update = 0 ; } + void init(unsigned& update) const { update = 0; } KOKKOS_INLINE_FUNCTION - void join( volatile unsigned & update , const volatile unsigned & input ) const { update += input ; } + void join(unsigned& update, const unsigned& input) const { update += input; } //------------------------------------ }; @@ -377,222 +379,210 @@ namespace Kokkos { namespace Example { namespace FENL { -template< class ElemCompType > +template class NodeElemGatherFill { -public: - - typedef typename ElemCompType::execution_space execution_space ; - typedef typename ElemCompType::vector_type vector_type ; - typedef typename ElemCompType::sparse_matrix_type sparse_matrix_type ; - typedef typename ElemCompType::elem_node_type elem_node_type ; - typedef typename ElemCompType::elem_vectors_type elem_vectors_type ; - typedef typename ElemCompType::elem_matrices_type elem_matrices_type ; - typedef typename ElemCompType::elem_graph_type elem_graph_type ; + public: + typedef typename ElemCompType::execution_space execution_space; + typedef typename ElemCompType::vector_type vector_type; + typedef typename ElemCompType::sparse_matrix_type 
sparse_matrix_type; + typedef typename ElemCompType::elem_node_type elem_node_type; + typedef typename ElemCompType::elem_vectors_type elem_vectors_type; + typedef typename ElemCompType::elem_matrices_type elem_matrices_type; + typedef typename ElemCompType::elem_graph_type elem_graph_type; - static const unsigned ElemNodeCount = ElemCompType::ElemNodeCount ; + static const unsigned ElemNodeCount = ElemCompType::ElemNodeCount; //------------------------------------ -private: - - typedef Kokkos::StaticCrsGraph< unsigned[2] , execution_space > CrsGraphType ; - typedef typename CrsGraphType::row_map_type::non_const_type RowMapType ; - typedef Kokkos::View< unsigned , execution_space > UnsignedValue ; - - enum PhaseType { FILL_NODE_COUNT , - SCAN_NODE_COUNT , - FILL_GRAPH_ENTRIES , - SORT_GRAPH_ENTRIES , - GATHER_FILL }; - - const elem_node_type elem_node_id ; - const elem_graph_type elem_graph ; - UnsignedValue row_total ; - RowMapType row_count ; - RowMapType row_map ; - CrsGraphType graph ; - vector_type residual ; - sparse_matrix_type jacobian ; - elem_vectors_type elem_residual ; - elem_matrices_type elem_jacobian ; - PhaseType phase ; - -public: + private: + typedef Kokkos::StaticCrsGraph CrsGraphType; + typedef typename CrsGraphType::row_map_type::non_const_type RowMapType; + typedef Kokkos::View UnsignedValue; + + enum PhaseType { + FILL_NODE_COUNT, + SCAN_NODE_COUNT, + FILL_GRAPH_ENTRIES, + SORT_GRAPH_ENTRIES, + GATHER_FILL + }; + const elem_node_type elem_node_id; + const elem_graph_type elem_graph; + UnsignedValue row_total; + RowMapType row_count; + RowMapType row_map; + CrsGraphType graph; + vector_type residual; + sparse_matrix_type jacobian; + elem_vectors_type elem_residual; + elem_matrices_type elem_jacobian; + PhaseType phase; + + public: NodeElemGatherFill() - : elem_node_id() - , elem_graph() - , row_total() - , row_count() - , row_map() - , graph() - , residual() - , jacobian() - , elem_residual() - , elem_jacobian() - , phase( FILL_NODE_COUNT ) - 
{} - - NodeElemGatherFill( const NodeElemGatherFill & rhs ) - : elem_node_id( rhs.elem_node_id ) - , elem_graph( rhs.elem_graph ) - , row_total( rhs.row_total ) - , row_count( rhs.row_count ) - , row_map( rhs.row_map ) - , graph( rhs.graph ) - , residual( rhs.residual ) - , jacobian( rhs.jacobian ) - , elem_residual( rhs.elem_residual ) - , elem_jacobian( rhs.elem_jacobian ) - , phase( rhs.phase ) - {} - - NodeElemGatherFill( const elem_node_type & arg_elem_node_id , - const elem_graph_type & arg_elem_graph , - const vector_type & arg_residual , - const sparse_matrix_type & arg_jacobian , - const elem_vectors_type & arg_elem_residual , - const elem_matrices_type & arg_elem_jacobian ) - : elem_node_id( arg_elem_node_id ) - , elem_graph( arg_elem_graph ) - , row_total( "row_total" ) - , row_count( "row_count" , arg_residual.extent(0) ) - , row_map( "graph_row_map" , arg_residual.extent(0) + 1 ) - , graph() - , residual( arg_residual ) - , jacobian( arg_jacobian ) - , elem_residual( arg_elem_residual ) - , elem_jacobian( arg_elem_jacobian ) - , phase( FILL_NODE_COUNT ) - { - //-------------------------------- - // Count node->element relations - - phase = FILL_NODE_COUNT ; - - Kokkos::parallel_for( elem_node_id.extent(0) , *this ); - - //-------------------------------- - - phase = SCAN_NODE_COUNT ; - - // Exclusive scan of row_count into row_map - // including the final total in the 'node_count + 1' position. - // Zero the 'row_count' values. 
- Kokkos::parallel_scan( residual.extent(0) , *this ); - - // Zero the row count for the fill: - Kokkos::deep_copy( row_count , typename RowMapType::value_type(0) ); - - unsigned graph_entry_count = 0 ; - - Kokkos::deep_copy( graph_entry_count , row_total ); - - // Assign graph's row_map and allocate graph's entries - graph.row_map = row_map ; - - typedef typename CrsGraphType::entries_type graph_entries_type ; - - graph.entries = graph_entries_type( "graph_entries" , graph_entry_count ); - - //-------------------------------- - // Fill graph's entries from the (node,node) set. - - phase = FILL_GRAPH_ENTRIES ; - - Kokkos::deep_copy( row_count , 0u ); - Kokkos::parallel_for( elem_node_id.extent(0) , *this ); - - execution_space().fence(); - - //-------------------------------- - // Done with the temporary sets and arrays - - row_total = UnsignedValue(); - row_count = RowMapType(); - row_map = RowMapType(); - - //-------------------------------- - - phase = SORT_GRAPH_ENTRIES ; - Kokkos::parallel_for( residual.extent(0) , *this ); - - execution_space().fence(); - - phase = GATHER_FILL ; - } - - void apply() const - { - Kokkos::parallel_for( residual.extent(0) , *this ); + : elem_node_id(), + elem_graph(), + row_total(), + row_count(), + row_map(), + graph(), + residual(), + jacobian(), + elem_residual(), + elem_jacobian(), + phase(FILL_NODE_COUNT) {} + + NodeElemGatherFill(const NodeElemGatherFill& rhs) + : elem_node_id(rhs.elem_node_id), + elem_graph(rhs.elem_graph), + row_total(rhs.row_total), + row_count(rhs.row_count), + row_map(rhs.row_map), + graph(rhs.graph), + residual(rhs.residual), + jacobian(rhs.jacobian), + elem_residual(rhs.elem_residual), + elem_jacobian(rhs.elem_jacobian), + phase(rhs.phase) {} + + NodeElemGatherFill(const elem_node_type& arg_elem_node_id, + const elem_graph_type& arg_elem_graph, + const vector_type& arg_residual, + const sparse_matrix_type& arg_jacobian, + const elem_vectors_type& arg_elem_residual, + const elem_matrices_type& 
arg_elem_jacobian) + : elem_node_id(arg_elem_node_id), + elem_graph(arg_elem_graph), + row_total("row_total"), + row_count("row_count", arg_residual.extent(0)), + row_map("graph_row_map", arg_residual.extent(0) + 1), + graph(), + residual(arg_residual), + jacobian(arg_jacobian), + elem_residual(arg_elem_residual), + elem_jacobian(arg_elem_jacobian), + phase(FILL_NODE_COUNT) { + //-------------------------------- + // Count node->element relations + + phase = FILL_NODE_COUNT; + + Kokkos::parallel_for(elem_node_id.extent(0), *this); + + //-------------------------------- + + phase = SCAN_NODE_COUNT; + + // Exclusive scan of row_count into row_map + // including the final total in the 'node_count + 1' position. + // Zero the 'row_count' values. + Kokkos::parallel_scan(residual.extent(0), *this); + + // Zero the row count for the fill: + Kokkos::deep_copy(row_count, typename RowMapType::value_type(0)); + + unsigned graph_entry_count = 0; + + Kokkos::deep_copy(graph_entry_count, row_total); + + // Assign graph's row_map and allocate graph's entries + graph.row_map = row_map; + + typedef typename CrsGraphType::entries_type graph_entries_type; + + graph.entries = graph_entries_type("graph_entries", graph_entry_count); + + //-------------------------------- + // Fill graph's entries from the (node,node) set. 
+ + phase = FILL_GRAPH_ENTRIES; + + Kokkos::deep_copy(row_count, 0u); + Kokkos::parallel_for(elem_node_id.extent(0), *this); + + execution_space().fence(); + + //-------------------------------- + // Done with the temporary sets and arrays + + row_total = UnsignedValue(); + row_count = RowMapType(); + row_map = RowMapType(); + + //-------------------------------- + + phase = SORT_GRAPH_ENTRIES; + Kokkos::parallel_for(residual.extent(0), *this); + + execution_space().fence(); + + phase = GATHER_FILL; } + void apply() const { Kokkos::parallel_for(residual.extent(0), *this); } + //------------------------------------ //------------------------------------ // parallel_for: Count node->element pairs KOKKOS_INLINE_FUNCTION - void fill_node_count( const unsigned ielem ) const - { - for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) { + void fill_node_count(const unsigned ielem) const { + for (unsigned row_local_node = 0; row_local_node < elem_node_id.extent(1); + ++row_local_node) { + const unsigned row_node = elem_node_id(ielem, row_local_node); - const unsigned row_node = elem_node_id( ielem , row_local_node ); - - if ( row_node < row_count.extent(0) ) { - atomic_fetch_add( & row_count( row_node ) , 1 ); + if (row_node < row_count.extent(0)) { + atomic_fetch_add(&row_count(row_node), 1); } } } KOKKOS_INLINE_FUNCTION - void fill_graph_entries( const unsigned ielem ) const - { - for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) { - - const unsigned row_node = elem_node_id( ielem , row_local_node ); - - if ( row_node < row_count.extent(0) ) { + void fill_graph_entries(const unsigned ielem) const { + for (unsigned row_local_node = 0; row_local_node < elem_node_id.extent(1); + ++row_local_node) { + const unsigned row_node = elem_node_id(ielem, row_local_node); - const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 ); + if (row_node < 
row_count.extent(0)) { + const unsigned offset = + graph.row_map(row_node) + atomic_fetch_add(&row_count(row_node), 1); - graph.entries( offset , 0 ) = ielem ; - graph.entries( offset , 1 ) = row_local_node ; + graph.entries(offset, 0) = ielem; + graph.entries(offset, 1) = row_local_node; } } } KOKKOS_INLINE_FUNCTION - void sort_graph_entries( const unsigned irow ) const - { - const unsigned row_beg = graph.row_map( irow ); - const unsigned row_end = graph.row_map( irow + 1 ); - for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) { - const unsigned elem = graph.entries(i,0); - const unsigned local = graph.entries(i,1); - unsigned j = i ; - for ( ; row_beg < j && elem < graph.entries(j-1,0) ; --j ) { - graph.entries(j,0) = graph.entries(j-1,0); - graph.entries(j,1) = graph.entries(j-1,1); + void sort_graph_entries(const unsigned irow) const { + const unsigned row_beg = graph.row_map(irow); + const unsigned row_end = graph.row_map(irow + 1); + for (unsigned i = row_beg + 1; i < row_end; ++i) { + const unsigned elem = graph.entries(i, 0); + const unsigned local = graph.entries(i, 1); + unsigned j = i; + for (; row_beg < j && elem < graph.entries(j - 1, 0); --j) { + graph.entries(j, 0) = graph.entries(j - 1, 0); + graph.entries(j, 1) = graph.entries(j - 1, 1); } - graph.entries(j,0) = elem ; - graph.entries(j,1) = local ; + graph.entries(j, 0) = elem; + graph.entries(j, 1) = local; } } //------------------------------------ KOKKOS_INLINE_FUNCTION - void gather_fill( const unsigned irow ) const - { + void gather_fill(const unsigned irow) const { const unsigned node_elem_begin = graph.row_map(irow); - const unsigned node_elem_end = graph.row_map(irow+1); + const unsigned node_elem_end = graph.row_map(irow + 1); // for each element that a node belongs to - for ( unsigned i = node_elem_begin ; i < node_elem_end ; i++ ) { - - const unsigned elem_id = graph.entries( i, 0); - const unsigned row_index = graph.entries( i, 1); + for (unsigned i = node_elem_begin; i < 
node_elem_end; i++) { + const unsigned elem_id = graph.entries(i, 0); + const unsigned row_index = graph.entries(i, 1); residual(irow) += elem_residual(elem_id, row_index); @@ -600,10 +590,10 @@ class NodeElemGatherFill { // gather the contents of the element stiffness // matrix that belong in irow - for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) { - const unsigned A_index = elem_graph( elem_id , row_index , j ); + for (unsigned j = 0; j < ElemNodeCount; ++j) { + const unsigned A_index = elem_graph(elem_id, row_index, j); - jacobian.values( A_index ) += elem_jacobian( elem_id, row_index, j ); + jacobian.values(A_index) += elem_jacobian(elem_id, row_index, j); } } } @@ -611,48 +601,46 @@ class NodeElemGatherFill { //------------------------------------ KOKKOS_INLINE_FUNCTION - void operator()( const unsigned iwork ) const - { - if ( phase == FILL_NODE_COUNT ) { - fill_node_count( iwork ); - } - else if ( phase == FILL_GRAPH_ENTRIES ) { - fill_graph_entries( iwork ); - } - else if ( phase == SORT_GRAPH_ENTRIES ) { - sort_graph_entries( iwork ); - } - else if ( phase == GATHER_FILL ) { - gather_fill( iwork ); + void operator()(const unsigned iwork) const { + if (phase == FILL_NODE_COUNT) { + fill_node_count(iwork); + } else if (phase == FILL_GRAPH_ENTRIES) { + fill_graph_entries(iwork); + } else if (phase == SORT_GRAPH_ENTRIES) { + sort_graph_entries(iwork); + } else if (phase == GATHER_FILL) { + gather_fill(iwork); } } //------------------------------------ // parallel_scan: row offsets - typedef unsigned value_type ; + typedef unsigned value_type; KOKKOS_INLINE_FUNCTION - void operator()( const unsigned irow , unsigned & update , const bool final ) const - { + void operator()(const unsigned irow, unsigned& update, + const bool final) const { // exclusive scan - if ( final ) { row_map( irow ) = update ; } + if (final) { + row_map(irow) = update; + } - update += row_count( irow ); + update += row_count(irow); - if ( final ) { - if ( irow + 1 == row_count.extent(0) ) 
{ - row_map( irow + 1 ) = update ; - row_total() = update ; + if (final) { + if (irow + 1 == row_count.extent(0)) { + row_map(irow + 1) = update; + row_total() = update; } } } KOKKOS_INLINE_FUNCTION - void init( unsigned & update ) const { update = 0 ; } + void init(unsigned& update) const { update = 0; } KOKKOS_INLINE_FUNCTION - void join( volatile unsigned & update , const volatile unsigned & input ) const { update += input ; } + void join(unsigned& update, const unsigned& input) const { update += input; } }; } /* namespace FENL */ @@ -665,188 +653,191 @@ namespace Kokkos { namespace Example { namespace FENL { -template< class FiniteElementMeshType , class SparseMatrixType > -class ElementComputation ; - +template +class ElementComputation; -template< class DeviceType , BoxElemPart::ElemOrder Order , class CoordinateMap , - typename ScalarType , typename OrdinalType , class MemoryTraits , typename SizeType > +template class ElementComputation< - Kokkos::Example::BoxElemFixture< DeviceType , Order , CoordinateMap > , - KokkosSparse::CrsMatrix< ScalarType , OrdinalType , DeviceType , MemoryTraits , SizeType > > -{ -public: - - typedef Kokkos::Example::BoxElemFixture< DeviceType, Order, CoordinateMap > mesh_type ; - typedef Kokkos::Example::HexElement_Data< mesh_type::ElemNode > element_data_type ; - - typedef KokkosSparse::CrsMatrix< ScalarType , OrdinalType , DeviceType , MemoryTraits , SizeType > sparse_matrix_type ; - typedef typename sparse_matrix_type::StaticCrsGraphType sparse_graph_type ; - - typedef DeviceType execution_space ; - typedef ScalarType scalar_type ; - - static const unsigned SpatialDim = element_data_type::spatial_dimension ; - static const unsigned TensorDim = SpatialDim * SpatialDim ; - static const unsigned ElemNodeCount = element_data_type::element_node_count ; - static const unsigned FunctionCount = element_data_type::function_count ; - static const unsigned IntegrationCount = element_data_type::integration_count ; + 
Kokkos::Example::BoxElemFixture, + KokkosSparse::CrsMatrix > { + public: + typedef Kokkos::Example::BoxElemFixture + mesh_type; + typedef Kokkos::Example::HexElement_Data + element_data_type; + + typedef KokkosSparse::CrsMatrix + sparse_matrix_type; + typedef typename sparse_matrix_type::StaticCrsGraphType sparse_graph_type; + + typedef DeviceType execution_space; + typedef ScalarType scalar_type; + + static const unsigned SpatialDim = element_data_type::spatial_dimension; + static const unsigned TensorDim = SpatialDim * SpatialDim; + static const unsigned ElemNodeCount = element_data_type::element_node_count; + static const unsigned FunctionCount = element_data_type::function_count; + static const unsigned IntegrationCount = element_data_type::integration_count; //------------------------------------ - typedef typename mesh_type::node_coord_type node_coord_type ; - typedef typename mesh_type::elem_node_type elem_node_type ; - typedef Kokkos::View< scalar_type*[FunctionCount][FunctionCount] , execution_space > elem_matrices_type ; - typedef Kokkos::View< scalar_type*[FunctionCount] , execution_space > elem_vectors_type ; - typedef Kokkos::View< scalar_type* , execution_space > vector_type ; + typedef typename mesh_type::node_coord_type node_coord_type; + typedef typename mesh_type::elem_node_type elem_node_type; + typedef Kokkos::View + elem_matrices_type; + typedef Kokkos::View + elem_vectors_type; + typedef Kokkos::View vector_type; - typedef typename NodeNodeGraph< elem_node_type , sparse_graph_type , ElemNodeCount >::ElemGraphType elem_graph_type ; + typedef typename NodeNodeGraph::ElemGraphType elem_graph_type; //------------------------------------ - //------------------------------------ // Computational data: - const element_data_type elem_data ; - const elem_node_type elem_node_ids ; - const node_coord_type node_coords ; - const elem_graph_type elem_graph ; - const elem_matrices_type elem_jacobians ; - const elem_vectors_type elem_residuals ; - const 
vector_type solution ; - const vector_type residual ; - const sparse_matrix_type jacobian ; - const scalar_type coeff_K ; - - ElementComputation( const ElementComputation & rhs ) - : elem_data() - , elem_node_ids( rhs.elem_node_ids ) - , node_coords( rhs.node_coords ) - , elem_graph( rhs.elem_graph ) - , elem_jacobians( rhs.elem_jacobians ) - , elem_residuals( rhs.elem_residuals ) - , solution( rhs.solution ) - , residual( rhs.residual ) - , jacobian( rhs.jacobian ) - , coeff_K( rhs.coeff_K ) - {} + const element_data_type elem_data; + const elem_node_type elem_node_ids; + const node_coord_type node_coords; + const elem_graph_type elem_graph; + const elem_matrices_type elem_jacobians; + const elem_vectors_type elem_residuals; + const vector_type solution; + const vector_type residual; + const sparse_matrix_type jacobian; + const scalar_type coeff_K; + + ElementComputation(const ElementComputation& rhs) + : elem_data(), + elem_node_ids(rhs.elem_node_ids), + node_coords(rhs.node_coords), + elem_graph(rhs.elem_graph), + elem_jacobians(rhs.elem_jacobians), + elem_residuals(rhs.elem_residuals), + solution(rhs.solution), + residual(rhs.residual), + jacobian(rhs.jacobian), + coeff_K(rhs.coeff_K) {} // If the element->sparse_matrix graph is provided then perform atomic updates - // Otherwise fill per-element contributions for subequent gather-add into a residual and jacobian. 
- ElementComputation( const mesh_type & arg_mesh , - const scalar_type arg_coeff_K , - const vector_type & arg_solution , - const elem_graph_type & arg_elem_graph , - const sparse_matrix_type & arg_jacobian , - const vector_type & arg_residual ) - : elem_data() - , elem_node_ids( arg_mesh.elem_node() ) - , node_coords( arg_mesh.node_coord() ) - , elem_graph( arg_elem_graph ) - , elem_jacobians() - , elem_residuals() - , solution( arg_solution ) - , residual( arg_residual ) - , jacobian( arg_jacobian ) - , coeff_K( arg_coeff_K ) - {} - - ElementComputation( const mesh_type & arg_mesh , - const scalar_type arg_coeff_K , - const vector_type & arg_solution ) - : elem_data() - , elem_node_ids( arg_mesh.elem_node() ) - , node_coords( arg_mesh.node_coord() ) - , elem_graph() - , elem_jacobians( "elem_jacobians" , arg_mesh.elem_count() ) - , elem_residuals( "elem_residuals" , arg_mesh.elem_count() ) - , solution( arg_solution ) - , residual() - , jacobian() - , coeff_K( arg_coeff_K ) - {} + // Otherwise fill per-element contributions for subequent gather-add into a + // residual and jacobian. 
+ ElementComputation(const mesh_type& arg_mesh, const scalar_type arg_coeff_K, + const vector_type& arg_solution, + const elem_graph_type& arg_elem_graph, + const sparse_matrix_type& arg_jacobian, + const vector_type& arg_residual) + : elem_data(), + elem_node_ids(arg_mesh.elem_node()), + node_coords(arg_mesh.node_coord()), + elem_graph(arg_elem_graph), + elem_jacobians(), + elem_residuals(), + solution(arg_solution), + residual(arg_residual), + jacobian(arg_jacobian), + coeff_K(arg_coeff_K) {} + + ElementComputation(const mesh_type& arg_mesh, const scalar_type arg_coeff_K, + const vector_type& arg_solution) + : elem_data(), + elem_node_ids(arg_mesh.elem_node()), + node_coords(arg_mesh.node_coord()), + elem_graph(), + elem_jacobians("elem_jacobians", arg_mesh.elem_count()), + elem_residuals("elem_residuals", arg_mesh.elem_count()), + solution(arg_solution), + residual(), + jacobian(), + coeff_K(arg_coeff_K) {} //------------------------------------ - void apply() const - { - parallel_for( elem_node_ids.extent(0) , *this ); - } + void apply() const { parallel_for(elem_node_ids.extent(0), *this); } //------------------------------------ static const unsigned FLOPS_transform_gradients = - /* Jacobian */ FunctionCount * TensorDim * 2 + - /* Inverse jacobian */ TensorDim * 6 + 6 + - /* Gradient transform */ FunctionCount * 15 ; + /* Jacobian */ FunctionCount * TensorDim * 2 + + /* Inverse jacobian */ TensorDim * 6 + 6 + + /* Gradient transform */ FunctionCount * 15; KOKKOS_INLINE_FUNCTION float transform_gradients( - const float grad[][ FunctionCount ] , // Gradient of bases master element - const double x[] , - const double y[] , - const double z[] , - float dpsidx[] , - float dpsidy[] , - float dpsidz[] ) const - { - enum { j11 = 0 , j12 = 1 , j13 = 2 , - j21 = 3 , j22 = 4 , j23 = 5 , - j31 = 6 , j32 = 7 , j33 = 8 }; + const float grad[][FunctionCount], // Gradient of bases master element + const double x[], const double y[], const double z[], float dpsidx[], + float 
dpsidy[], float dpsidz[]) const { + enum { + j11 = 0, + j12 = 1, + j13 = 2, + j21 = 3, + j22 = 4, + j23 = 5, + j31 = 6, + j32 = 7, + j33 = 8 + }; // Jacobian accumulation: - double J[ TensorDim ] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + double J[TensorDim] = {0, 0, 0, 0, 0, 0, 0, 0, 0}; - for( unsigned i = 0; i < FunctionCount ; ++i ) { - const double x1 = x[i] ; - const double x2 = y[i] ; - const double x3 = z[i] ; + for (unsigned i = 0; i < FunctionCount; ++i) { + const double x1 = x[i]; + const double x2 = y[i]; + const double x3 = z[i]; - const float g1 = grad[0][i] ; - const float g2 = grad[1][i] ; - const float g3 = grad[2][i] ; + const float g1 = grad[0][i]; + const float g2 = grad[1][i]; + const float g3 = grad[2][i]; - J[j11] += g1 * x1 ; - J[j12] += g1 * x2 ; - J[j13] += g1 * x3 ; + J[j11] += g1 * x1; + J[j12] += g1 * x2; + J[j13] += g1 * x3; - J[j21] += g2 * x1 ; - J[j22] += g2 * x2 ; - J[j23] += g2 * x3 ; + J[j21] += g2 * x1; + J[j22] += g2 * x2; + J[j23] += g2 * x3; - J[j31] += g3 * x1 ; - J[j32] += g3 * x2 ; - J[j33] += g3 * x3 ; + J[j31] += g3 * x1; + J[j32] += g3 * x2; + J[j33] += g3 * x3; } // Inverse jacobian: - float invJ[ TensorDim ] = { - static_cast( J[j22] * J[j33] - J[j23] * J[j32] ) , - static_cast( J[j13] * J[j32] - J[j12] * J[j33] ) , - static_cast( J[j12] * J[j23] - J[j13] * J[j22] ) , + float invJ[TensorDim] = { + static_cast(J[j22] * J[j33] - J[j23] * J[j32]), + static_cast(J[j13] * J[j32] - J[j12] * J[j33]), + static_cast(J[j12] * J[j23] - J[j13] * J[j22]), - static_cast( J[j23] * J[j31] - J[j21] * J[j33] ) , - static_cast( J[j11] * J[j33] - J[j13] * J[j31] ) , - static_cast( J[j13] * J[j21] - J[j11] * J[j23] ) , + static_cast(J[j23] * J[j31] - J[j21] * J[j33]), + static_cast(J[j11] * J[j33] - J[j13] * J[j31]), + static_cast(J[j13] * J[j21] - J[j11] * J[j23]), - static_cast( J[j21] * J[j32] - J[j22] * J[j31] ) , - static_cast( J[j12] * J[j31] - J[j11] * J[j32] ) , - static_cast( J[j11] * J[j22] - J[j12] * J[j21] ) }; + static_cast(J[j21] * 
J[j32] - J[j22] * J[j31]), + static_cast(J[j12] * J[j31] - J[j11] * J[j32]), + static_cast(J[j11] * J[j22] - J[j12] * J[j21])}; - const float detJ = J[j11] * invJ[j11] + - J[j21] * invJ[j12] + - J[j31] * invJ[j13] ; + const float detJ = + J[j11] * invJ[j11] + J[j21] * invJ[j12] + J[j31] * invJ[j13]; - const float detJinv = 1.0 / detJ ; + const float detJinv = 1.0 / detJ; - for ( unsigned i = 0 ; i < TensorDim ; ++i ) { invJ[i] *= detJinv ; } + for (unsigned i = 0; i < TensorDim; ++i) { + invJ[i] *= detJinv; + } // Transform gradients: - for( unsigned i = 0; i < FunctionCount ; ++i ) { + for (unsigned i = 0; i < FunctionCount; ++i) { const float g0 = grad[0][i]; const float g1 = grad[1][i]; const float g2 = grad[2][i]; @@ -856,113 +847,101 @@ class ElementComputation< dpsidz[i] = g0 * invJ[j31] + g1 * invJ[j32] + g2 * invJ[j33]; } - return detJ ; + return detJ; } KOKKOS_INLINE_FUNCTION - void contributeResidualJacobian( - const float coeff_k , - const double dof_values[] , - const float dpsidx[] , - const float dpsidy[] , - const float dpsidz[] , - const float detJ , - const float integ_weight , - const float bases_vals[] , - double elem_res[] , - double elem_mat[][ FunctionCount ] ) const - { - double value_at_pt = 0 ; - double gradx_at_pt = 0 ; - double grady_at_pt = 0 ; - double gradz_at_pt = 0 ; - - for ( unsigned m = 0 ; m < FunctionCount ; m++ ) { - value_at_pt += dof_values[m] * bases_vals[m] ; - gradx_at_pt += dof_values[m] * dpsidx[m] ; - grady_at_pt += dof_values[m] * dpsidy[m] ; - gradz_at_pt += dof_values[m] * dpsidz[m] ; + void contributeResidualJacobian(const float coeff_k, + const double dof_values[], + const float dpsidx[], const float dpsidy[], + const float dpsidz[], const float detJ, + const float integ_weight, + const float bases_vals[], double elem_res[], + double elem_mat[][FunctionCount]) const { + double value_at_pt = 0; + double gradx_at_pt = 0; + double grady_at_pt = 0; + double gradz_at_pt = 0; + + for (unsigned m = 0; m < FunctionCount; 
m++) { + value_at_pt += dof_values[m] * bases_vals[m]; + gradx_at_pt += dof_values[m] * dpsidx[m]; + grady_at_pt += dof_values[m] * dpsidy[m]; + gradz_at_pt += dof_values[m] * dpsidz[m]; } - const scalar_type k_detJ_weight = coeff_k * detJ * integ_weight ; - const double res_val = value_at_pt * value_at_pt * detJ * integ_weight ; - const double mat_val = 2.0 * value_at_pt * detJ * integ_weight ; + const scalar_type k_detJ_weight = coeff_k * detJ * integ_weight; + const double res_val = value_at_pt * value_at_pt * detJ * integ_weight; + const double mat_val = 2.0 * value_at_pt * detJ * integ_weight; - // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$ - // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$ + // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d + // \Omega $$ + // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla + // \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$ - for ( unsigned m = 0; m < FunctionCount; ++m) { - double * const mat = elem_mat[m] ; + for (unsigned m = 0; m < FunctionCount; ++m) { + double* const mat = elem_mat[m]; const float bases_val_m = bases_vals[m]; - const float dpsidx_m = dpsidx[m] ; - const float dpsidy_m = dpsidy[m] ; - const float dpsidz_m = dpsidz[m] ; - - elem_res[m] += k_detJ_weight * ( dpsidx_m * gradx_at_pt + - dpsidy_m * grady_at_pt + - dpsidz_m * gradz_at_pt ) + - res_val * bases_val_m ; - - for( unsigned n = 0; n < FunctionCount; n++) { - - mat[n] += k_detJ_weight * ( dpsidx_m * dpsidx[n] + - dpsidy_m * dpsidy[n] + - dpsidz_m * dpsidz[n] ) + + const float dpsidx_m = dpsidx[m]; + const float dpsidy_m = dpsidy[m]; + const float dpsidz_m = dpsidz[m]; + + elem_res[m] += + k_detJ_weight * (dpsidx_m * gradx_at_pt + dpsidy_m * grady_at_pt + + dpsidz_m * gradz_at_pt) + + res_val * bases_val_m; + + for (unsigned n = 0; n < FunctionCount; n++) { + mat[n] += 
k_detJ_weight * (dpsidx_m * dpsidx[n] + dpsidy_m * dpsidy[n] + + dpsidz_m * dpsidz[n]) + mat_val * bases_val_m * bases_vals[n]; } } } KOKKOS_INLINE_FUNCTION - void operator()( const unsigned ielem ) const - { + void operator()(const unsigned ielem) const { // Gather nodal coordinates and solution vector: - double x[ FunctionCount ] ; - double y[ FunctionCount ] ; - double z[ FunctionCount ] ; - double val[ FunctionCount ] ; - unsigned node_index[ ElemNodeCount ]; + double x[FunctionCount]; + double y[FunctionCount]; + double z[FunctionCount]; + double val[FunctionCount]; + unsigned node_index[ElemNodeCount]; - for ( unsigned i = 0 ; i < ElemNodeCount ; ++i ) { - const unsigned ni = elem_node_ids( ielem , i ); + for (unsigned i = 0; i < ElemNodeCount; ++i) { + const unsigned ni = elem_node_ids(ielem, i); - node_index[i] = ni ; + node_index[i] = ni; - x[i] = node_coords( ni , 0 ); - y[i] = node_coords( ni , 1 ); - z[i] = node_coords( ni , 2 ); + x[i] = node_coords(ni, 0); + y[i] = node_coords(ni, 1); + z[i] = node_coords(ni, 2); - val[i] = solution( ni ); + val[i] = solution(ni); } + double elem_vec[FunctionCount]; + double elem_mat[FunctionCount][FunctionCount]; - double elem_vec[ FunctionCount ] ; - double elem_mat[ FunctionCount ][ FunctionCount ] ; - - for( unsigned i = 0; i < FunctionCount ; i++ ) { - elem_vec[i] = 0 ; - for( unsigned j = 0; j < FunctionCount ; j++){ - elem_mat[i][j] = 0 ; + for (unsigned i = 0; i < FunctionCount; i++) { + elem_vec[i] = 0; + for (unsigned j = 0; j < FunctionCount; j++) { + elem_mat[i][j] = 0; } } + for (unsigned i = 0; i < IntegrationCount; ++i) { + float dpsidx[FunctionCount]; + float dpsidy[FunctionCount]; + float dpsidz[FunctionCount]; - for ( unsigned i = 0 ; i < IntegrationCount ; ++i ) { - float dpsidx[ FunctionCount ] ; - float dpsidy[ FunctionCount ] ; - float dpsidz[ FunctionCount ] ; + const float detJ = transform_gradients(elem_data.gradients[i], x, y, z, + dpsidx, dpsidy, dpsidz); - const float detJ = - 
transform_gradients( elem_data.gradients[i] , x , y , z , - dpsidx , dpsidy , dpsidz ); - - contributeResidualJacobian( coeff_K , - val , dpsidx , dpsidy , dpsidz , - detJ , - elem_data.weights[i] , - elem_data.values[i] , - elem_vec , elem_mat ); + contributeResidualJacobian(coeff_K, val, dpsidx, dpsidy, dpsidz, detJ, + elem_data.weights[i], elem_data.values[i], + elem_vec, elem_mat); } #if 0 @@ -984,24 +963,23 @@ if ( 1 == ielem ) { #endif - if ( ! residual.extent(0) ) { - for( unsigned i = 0; i < FunctionCount ; i++){ - elem_residuals(ielem, i) = elem_vec[i] ; - for( unsigned j = 0; j < FunctionCount ; j++){ - elem_jacobians(ielem, i, j) = elem_mat[i][j] ; + if (!residual.extent(0)) { + for (unsigned i = 0; i < FunctionCount; i++) { + elem_residuals(ielem, i) = elem_vec[i]; + for (unsigned j = 0; j < FunctionCount; j++) { + elem_jacobians(ielem, i, j) = elem_mat[i][j]; } } - } - else { - for( unsigned i = 0 ; i < FunctionCount ; i++ ) { - const unsigned row = node_index[i] ; - if ( row < residual.extent(0) ) { - atomic_fetch_add( & residual( row ) , elem_vec[i] ); - - for( unsigned j = 0 ; j < FunctionCount ; j++ ) { - const unsigned entry = elem_graph( ielem , i , j ); - if ( entry != ~0u ) { - atomic_fetch_add( & jacobian.values( entry ) , elem_mat[i][j] ); + } else { + for (unsigned i = 0; i < FunctionCount; i++) { + const unsigned row = node_index[i]; + if (row < residual.extent(0)) { + atomic_fetch_add(&residual(row), elem_vec[i]); + + for (unsigned j = 0; j < FunctionCount; j++) { + const unsigned entry = elem_graph(ielem, i, j); + if (entry != ~0u) { + atomic_fetch_add(&jacobian.values(entry), elem_mat[i][j]); } } } @@ -1012,119 +990,114 @@ if ( 1 == ielem ) { //---------------------------------------------------------------------------- -template< class FixtureType , class SparseMatrixType > -class DirichletComputation ; +template +class DirichletComputation; -template< class DeviceType , BoxElemPart::ElemOrder Order , class CoordinateMap , - typename 
ScalarType , typename OrdinalType , class MemoryTraits , typename SizeType > +template class DirichletComputation< - Kokkos::Example::BoxElemFixture< DeviceType , Order , CoordinateMap > , - KokkosSparse::CrsMatrix< ScalarType , OrdinalType , DeviceType , MemoryTraits , SizeType > > -{ -public: - - typedef Kokkos::Example::BoxElemFixture< DeviceType, Order, CoordinateMap > mesh_type ; - typedef typename mesh_type::node_coord_type node_coord_type ; - typedef typename node_coord_type::value_type scalar_coord_type ; - - typedef KokkosSparse::CrsMatrix< ScalarType , OrdinalType , DeviceType , MemoryTraits , SizeType > sparse_matrix_type ; - typedef typename sparse_matrix_type::StaticCrsGraphType sparse_graph_type ; - - typedef DeviceType execution_space ; - typedef ScalarType scalar_type ; + Kokkos::Example::BoxElemFixture, + KokkosSparse::CrsMatrix > { + public: + typedef Kokkos::Example::BoxElemFixture + mesh_type; + typedef typename mesh_type::node_coord_type node_coord_type; + typedef typename node_coord_type::value_type scalar_coord_type; + + typedef KokkosSparse::CrsMatrix + sparse_matrix_type; + typedef typename sparse_matrix_type::StaticCrsGraphType sparse_graph_type; + + typedef DeviceType execution_space; + typedef ScalarType scalar_type; //------------------------------------ - typedef Kokkos::View< scalar_type* , execution_space > vector_type ; + typedef Kokkos::View vector_type; //------------------------------------ // Computational data: - const node_coord_type node_coords ; - const vector_type solution ; - const sparse_matrix_type jacobian ; - const vector_type residual ; - const scalar_type bc_lower_value ; - const scalar_type bc_upper_value ; - const scalar_coord_type bc_lower_limit ; - const scalar_coord_type bc_upper_limit ; - const unsigned bc_plane ; - const unsigned node_count ; - bool init ; - - - DirichletComputation( const mesh_type & arg_mesh , - const vector_type & arg_solution , - const sparse_matrix_type & arg_jacobian , - const 
vector_type & arg_residual , - const unsigned arg_bc_plane , - const scalar_type arg_bc_lower_value , - const scalar_type arg_bc_upper_value ) - : node_coords( arg_mesh.node_coord() ) - , solution( arg_solution ) - , jacobian( arg_jacobian ) - , residual( arg_residual ) - , bc_lower_value( arg_bc_lower_value ) - , bc_upper_value( arg_bc_upper_value ) - , bc_lower_limit( std::numeric_limits::epsilon() ) - , bc_upper_limit( scalar_coord_type(1) - std::numeric_limits::epsilon() ) - , bc_plane( arg_bc_plane ) - , node_count( arg_mesh.node_count_owned() ) - , init( false ) - { - parallel_for( node_count , *this ); - init = true ; - } - - void apply() const - { - parallel_for( node_count , *this ); + const node_coord_type node_coords; + const vector_type solution; + const sparse_matrix_type jacobian; + const vector_type residual; + const scalar_type bc_lower_value; + const scalar_type bc_upper_value; + const scalar_coord_type bc_lower_limit; + const scalar_coord_type bc_upper_limit; + const unsigned bc_plane; + const unsigned node_count; + bool init; + + DirichletComputation(const mesh_type& arg_mesh, + const vector_type& arg_solution, + const sparse_matrix_type& arg_jacobian, + const vector_type& arg_residual, + const unsigned arg_bc_plane, + const scalar_type arg_bc_lower_value, + const scalar_type arg_bc_upper_value) + : node_coords(arg_mesh.node_coord()), + solution(arg_solution), + jacobian(arg_jacobian), + residual(arg_residual), + bc_lower_value(arg_bc_lower_value), + bc_upper_value(arg_bc_upper_value), + bc_lower_limit(std::numeric_limits::epsilon()), + bc_upper_limit(scalar_coord_type(1) - + std::numeric_limits::epsilon()), + bc_plane(arg_bc_plane), + node_count(arg_mesh.node_count_owned()), + init(false) { + parallel_for(node_count, *this); + init = true; } + void apply() const { parallel_for(node_count, *this); } + //------------------------------------ KOKKOS_INLINE_FUNCTION - void operator()( const unsigned inode ) const - { + void operator()(const unsigned 
inode) const { // Apply dirichlet boundary condition on the Solution and Residual vectors. // To maintain the symmetry of the original global stiffness matrix, // zero out the columns that correspond to boundary conditions, and // update the residual vector accordingly const unsigned iBeg = jacobian.graph.row_map[inode]; - const unsigned iEnd = jacobian.graph.row_map[inode+1]; + const unsigned iEnd = jacobian.graph.row_map[inode + 1]; - const scalar_coord_type c = node_coords(inode,bc_plane); - const bool bc_lower = c <= bc_lower_limit ; - const bool bc_upper = bc_upper_limit <= c ; + const scalar_coord_type c = node_coords(inode, bc_plane); + const bool bc_lower = c <= bc_lower_limit; + const bool bc_upper = bc_upper_limit <= c; - if ( ! init ) { - solution(inode) = bc_lower ? bc_lower_value : ( - bc_upper ? bc_upper_value : 0 ); - } - else { - if ( bc_lower || bc_upper ) { - - residual(inode) = 0 ; + if (!init) { + solution(inode) = + bc_lower ? bc_lower_value : (bc_upper ? bc_upper_value : 0); + } else { + if (bc_lower || bc_upper) { + residual(inode) = 0; // zero each value on the row, and leave a one // on the diagonal - for( unsigned i = iBeg ; i < iEnd ; ++i ) { - jacobian.values(i) = int(inode) == int(jacobian.graph.entries(i)) ? 1 : 0 ; + for (unsigned i = iBeg; i < iEnd; ++i) { + jacobian.values(i) = + int(inode) == int(jacobian.graph.entries(i)) ? 1 : 0; } - } - else { - + } else { // Find any columns that are boundary conditions. 
// Clear them and adjust the residual vector - for( unsigned i = iBeg ; i < iEnd ; ++i ) { - const unsigned cnode = jacobian.graph.entries(i) ; - const scalar_coord_type cc = node_coords(cnode,bc_plane); + for (unsigned i = iBeg; i < iEnd; ++i) { + const unsigned cnode = jacobian.graph.entries(i); + const scalar_coord_type cc = node_coords(cnode, bc_plane); - if ( ( cc <= bc_lower_limit ) || ( bc_upper_limit <= cc ) ) { - jacobian.values(i) = 0 ; + if ((cc <= bc_lower_limit) || (bc_upper_limit <= cc)) { + jacobian.values(i) = 0; } } } @@ -1139,11 +1112,10 @@ class DirichletComputation< //---------------------------------------------------------------------------- /* A Cuda-specific specialization for the element computation functor. */ -#if defined( __CUDACC__ ) +#if defined(__CUDACC__) // #include #endif //---------------------------------------------------------------------------- #endif /* #ifndef KOKKOS_EXAMPLE_FENLFUNCTORS_HPP */ - diff --git a/example/gmres/ex_real_A.cpp b/example/gmres/ex_real_A.cpp index 2c119d2a9c..b3e95605f7 100644 --- a/example/gmres/ex_real_A.cpp +++ b/example/gmres/ex_real_A.cpp @@ -42,31 +42,31 @@ //@HEADER */ -#include -#include"KokkosKernels_IOUtils.hpp" -#include -#include -#include -#include -#include +#include +#include "KokkosSparse_IOUtils.hpp" +#include +#include +#include +#include +#include -#include"gmres.hpp" +#include "gmres.hpp" -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) { typedef double ST; typedef int OT; - typedef Kokkos::DefaultExecutionSpace EXSP; + typedef Kokkos::DefaultExecutionSpace EXSP; - using ViewVectorType = Kokkos::View; + using ViewVectorType = Kokkos::View; - std::string filename("bcsstk09.mtx"); // example matrix - std::string ortho("CGS2"); //orthog type - int m = 50; //Max subspace size before restarting. - double convTol = 1e-10; //Relative residual convergence tolerance. - int cycLim = 50; //Maximum number of times to restart the solver. 
- bool rand_rhs = false; //Generate random right-hand side. + std::string filename("bcsstk09.mtx"); // example matrix + std::string ortho("CGS2"); // orthog type + int m = 50; // Max subspace size before restarting. + double convTol = 1e-10; // Relative residual convergence tolerance. + int cycLim = 50; // Maximum number of times to restart the solver. + bool rand_rhs = false; // Generate random right-hand side. - for (int i=1;i solverOpts; - solverOpts.tol = convTol; - solverOpts.m = m; + solverOpts.tol = convTol; + solverOpts.m = m; solverOpts.maxRestart = cycLim; - solverOpts.ortho = ortho; + solverOpts.ortho = ortho; + solverOpts.verbose = false; // No verbosity needed for most testing - //Initialize Kokkos AFTER parsing parameters: + // Initialize Kokkos AFTER parsing parameters: Kokkos::initialize(); { - - // Read in a matrix Market file and use it to test the Kokkos Operator. - KokkosSparse::CrsMatrix A = - KokkosKernels::Impl::read_kokkos_crst_matrix>(filename.c_str()); - - int n = A.numRows(); - ViewVectorType X("X",n); //Solution and initial guess - ViewVectorType Wj("Wj",n); //For checking residuals at end. - ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"),n);//right-hand side vec - - if(rand_rhs){ - // Make rhs random. - int rand_seed = 123; - Kokkos::Random_XorShift64_Pool<> pool(rand_seed); - Kokkos::fill_random(B, pool, -1,1); - } - else{ - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B,1.0); - } - - // Run GMRS solve: - GmresStats solveStats = gmres(A, B, X, solverOpts); - - // Double check residuals at end of solve: - ST nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. 
- ST endRes = KokkosBlas::nrm2(B)/nrmB; - std::cout << "=========================================" << std::endl; - std::cout << "Verify from main: Ending residual is " << endRes << std::endl; - std::cout << "Number of iterations is: " << solveStats.numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl; - std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; - + // Read in a matrix Market file and use it to test the Kokkos Operator. + KokkosSparse::CrsMatrix A = + KokkosSparse::Impl::read_kokkos_crst_matrix< + KokkosSparse::CrsMatrix>(filename.c_str()); + + int n = A.numRows(); + ViewVectorType X("X", n); // Solution and initial guess + ViewVectorType Wj("Wj", n); // For checking residuals at end. + ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + n); // right-hand side vec + + if (rand_rhs) { + // Make rhs random. + int rand_seed = 123; + Kokkos::Random_XorShift64_Pool<> pool(rand_seed); + Kokkos::fill_random(B, pool, -1, 1); + } else { + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + } + + // Run GMRS solve: + GmresStats solveStats = + gmres(A, B, X, solverOpts); + + // Double check residuals at end of solve: + ST nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. 
+ ST endRes = KokkosBlas::nrm2(B) / nrmB; + std::cout << "=========================================" << std::endl; + std::cout << "Verify from main: Ending residual is " << endRes << std::endl; + std::cout << "Number of iterations is: " << solveStats.numIters + << std::endl; + std::cout << "Diff of residual from main - residual from solver: " + << solveStats.endRelRes - endRes << std::endl; + std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; } Kokkos::finalize(); - } - diff --git a/example/gmres/gmres.hpp b/example/gmres/gmres.hpp index 48a6e4ae0d..22b23cde7a 100644 --- a/example/gmres/gmres.hpp +++ b/example/gmres/gmres.hpp @@ -117,10 +117,12 @@ struct GmresOpts { typename Kokkos::Details::ArithTraits::mag_type tol; int m; int maxRestart; + bool verbose; std::string ortho; std::string precSide; - GmresOpts() : tol(1e-8), m(50), maxRestart(50), ortho("CGS2") {} + GmresOpts() + : tol(1e-8), m(50), maxRestart(50), verbose(true), ortho("CGS2") {} }; template @@ -182,7 +184,9 @@ GmresStats gmres( MT nrmB, trueRes, relRes, shortRelRes; GmresStats myStats; - std::cout << "Convergence tolerance is: " << opts.tol << std::endl; + if (opts.verbose) { + std::cout << "Convergence tolerance is: " << opts.tol << std::endl; + } ViewVectorType Xiter( "Xiter", n); // Intermediate solution at iterations before restart. 
@@ -229,7 +233,9 @@ GmresStats gmres( relRes = 0; } shortRelRes = relRes; - std::cout << "Initial relative residual is: " << relRes << std::endl; + if (opts.verbose) { + std::cout << "Initial relative residual is: " << relRes << std::endl; + } if (relRes < opts.tol) { converged = true; } @@ -311,8 +317,10 @@ GmresStats gmres( GVec_h(j) = GVec_h(j) * CosVal_h(j); shortRelRes = fabs(GVec_h(j + 1)) / nrmB; - std::cout << "Shortcut relative residual for iteration " - << j + (cycle * m) << " is: " << shortRelRes << std::endl; + if (opts.verbose) { + std::cout << "Shortcut relative residual for iteration " + << j + (cycle * m) << " is: " << shortRelRes << std::endl; + } if (tmpNrm <= 1e-14 && shortRelRes >= opts.tol) { throw std::runtime_error( "GMRES has experienced lucky breakdown, but the residual has not converged.\n\ @@ -359,8 +367,10 @@ GmresStats gmres( KokkosBlas::axpy(-one, Wj, Res); // r = b-Ax. trueRes = KokkosBlas::nrm2(Res); relRes = trueRes / nrmB; - std::cout << "True relative residual for iteration " << j + (cycle * m) - << " is : " << relRes << std::endl; + if (opts.verbose) { + std::cout << "True relative residual for iteration " + << j + (cycle * m) << " is : " << relRes << std::endl; + } numIters = j + 1; if (relRes < opts.tol) { @@ -390,15 +400,21 @@ GmresStats gmres( std::cout << "Ending relative residual is: " << relRes << std::endl; myStats.endRelRes = static_cast(relRes); if (converged) { - std::cout << "Solver converged! " << std::endl; + if (opts.verbose) { + std::cout << "Solver converged! " << std::endl; + } myStats.convFlagVal = GmresStats::FLAG::Conv; } else if (shortRelRes < opts.tol) { - std::cout << "Shortcut residual converged, but solver experienced a loss " - "of accuracy." - << std::endl; + if (opts.verbose) { + std::cout << "Shortcut residual converged, but solver experienced a loss " + "of accuracy." + << std::endl; + } myStats.convFlagVal = GmresStats::FLAG::LOA; } else { - std::cout << "Solver did not converge. 
:( " << std::endl; + if (opts.verbose) { + std::cout << "Solver did not converge. :( " << std::endl; + } myStats.convFlagVal = GmresStats::FLAG::NoConv; } if (cycle > 0) { @@ -406,8 +422,10 @@ GmresStats gmres( } else { myStats.numIters = 0; } - std::cout << "The solver completed " << myStats.numIters << " iterations." - << std::endl; + if (opts.verbose) { + std::cout << "The solver completed " << myStats.numIters << " iterations." + << std::endl; + } Kokkos::Profiling::popRegion(); return myStats; diff --git a/example/gmres/test_cmplx_A.cpp b/example/gmres/test_cmplx_A.cpp index a19d6ad7e1..ad8d19fb03 100644 --- a/example/gmres/test_cmplx_A.cpp +++ b/example/gmres/test_cmplx_A.cpp @@ -44,6 +44,7 @@ #include #include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" #include #include #include @@ -65,6 +66,7 @@ int main(int /*argc*/, char** /*argv[]*/) { solverOpts.tol = 1e-05; // Relative residual convergence tolerance. solverOpts.maxRestart = 60; solverOpts.ortho = "CGS2"; // orthog type + solverOpts.verbose = false; // No verbosity needed for most testing bool pass1 = false; bool pass2 = false; @@ -76,7 +78,7 @@ int main(int /*argc*/, char** /*argv[]*/) { { // Read in a matrix Market file and use it to test the Kokkos Operator. 
KokkosSparse::CrsMatrix A = - KokkosKernels::Impl::read_kokkos_crst_matrix< + KokkosSparse::Impl::read_kokkos_crst_matrix< KokkosSparse::CrsMatrix>(filename.c_str()); int n = A.numRows(); diff --git a/example/gmres/test_prec.cpp b/example/gmres/test_prec.cpp index 852a735aa6..11122edccd 100644 --- a/example/gmres/test_prec.cpp +++ b/example/gmres/test_prec.cpp @@ -42,30 +42,30 @@ //@HEADER */ -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include "KokkosSparse_IOUtils.hpp" -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) { + typedef double ST; + typedef int OT; + typedef Kokkos::DefaultExecutionSpace EXSP; - typedef double ST; - typedef int OT; - typedef Kokkos::DefaultExecutionSpace EXSP; + using ViewVectorType = Kokkos::View; - using ViewVectorType = Kokkos::View; + std::string ortho("CGS2"); // orthog type + int n = 1000; // Matrix size + int m = 50; // Max subspace size before restarting. + double convTol = 1e-10; // Relative residual convergence tolerance. + int cycLim = 50; // Maximum number of times to restart the solver. + bool rand_rhs = false; // Generate random right-hand side. + bool pass = false; - std::string ortho("CGS2"); //orthog type - int n = 1000; //Matrix size - int m = 50; //Max subspace size before restarting. - double convTol = 1e-10; //Relative residual convergence tolerance. - int cycLim = 50; //Maximum number of times to restart the solver. - bool rand_rhs = false; //Generate random right-hand side. 
- bool pass = false; - - for (int i=1;i solverOpts; - solverOpts.tol = convTol; - solverOpts.m = m; + solverOpts.tol = convTol; + solverOpts.m = m; solverOpts.maxRestart = cycLim; - solverOpts.ortho = ortho; + solverOpts.ortho = ortho; + solverOpts.verbose = false; // No verbosity needed for most testing - //Initialize Kokkos AFTER parsing parameters: + // Initialize Kokkos AFTER parsing parameters: Kokkos::initialize(); { - // Generate a diagonal matrix with entries 1, 2, ...., 1000 and its inverse. - KokkosSparse::CrsMatrix A = - KokkosKernels::Impl::kk_generate_diag_matrix>(n); - KokkosSparse::Experimental::MatrixPrec * myPrec = - new KokkosSparse::Experimental::MatrixPrec( - KokkosKernels::Impl::kk_generate_diag_matrix>(n, true)); + // Generate a diagonal matrix with entries 1, 2, ...., 1000 and its inverse. + KokkosSparse::CrsMatrix A = + KokkosSparse::Impl::kk_generate_diag_matrix< + KokkosSparse::CrsMatrix>(n); + KokkosSparse::Experimental::MatrixPrec* + myPrec = + new KokkosSparse::Experimental::MatrixPrec( + KokkosSparse::Impl::kk_generate_diag_matrix< + KokkosSparse::CrsMatrix>(n, true)); - ViewVectorType X(Kokkos::view_alloc(Kokkos::WithoutInitializing, "X"),n); //Solution and initial guess - ViewVectorType Wj("Wj",n); //For checking residuals at end. - ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"),n);//right-hand side vec - int rand_seed = 123; - Kokkos::Random_XorShift64_Pool<> pool(rand_seed); - Kokkos::fill_random(X, pool, -1,1); //Use non-zero initial guess to test GMRES properties. - if(rand_rhs){ - Kokkos::fill_random(B, pool, -1,1); - } - else{ - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B,1.0); - } + ViewVectorType X(Kokkos::view_alloc(Kokkos::WithoutInitializing, "X"), + n); // Solution and initial guess + ViewVectorType Wj("Wj", n); // For checking residuals at end. 
+ ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + n); // right-hand side vec + int rand_seed = 123; + Kokkos::Random_XorShift64_Pool<> pool(rand_seed); + Kokkos::fill_random( + X, pool, -1, + 1); // Use non-zero initial guess to test GMRES properties. + if (rand_rhs) { + Kokkos::fill_random(B, pool, -1, 1); + } else { + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + } - GmresStats solveStats = gmres(A, B, X, solverOpts, myPrec); - - // Double check residuals at end of solve: - ST nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. - ST endRes = KokkosBlas::nrm2(B)/nrmB; - std::cout << "=========================================" << std::endl; - std::cout << "Verify from main: Ending residual is " << endRes << std::endl; - std::cout << "Number of iterations is: " << solveStats.numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl; - std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; - if( endRes < convTol && solveStats.numIters == 1){ - pass = true; - } + GmresStats solveStats = + gmres(A, B, X, solverOpts, myPrec); + // Double check residuals at end of solve: + ST nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. 
+ ST endRes = KokkosBlas::nrm2(B) / nrmB; + std::cout << "=========================================" << std::endl; + std::cout << "Verify from main: Ending residual is " << endRes << std::endl; + std::cout << "Number of iterations is: " << solveStats.numIters + << std::endl; + std::cout << "Diff of residual from main - residual from solver: " + << solveStats.endRelRes - endRes << std::endl; + std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; + if (endRes < convTol && solveStats.numIters == 1) { + pass = true; + } } Kokkos::finalize(); - if( pass ){ + if (pass) { std::cout << "Test passed!" << std::endl; - } - else{ + } else { std::cout << "Test Failed!" << std::endl; } - return ( pass ? EXIT_SUCCESS : EXIT_FAILURE ); + return (pass ? EXIT_SUCCESS : EXIT_FAILURE); } - diff --git a/example/gmres/test_real_A.cpp b/example/gmres/test_real_A.cpp index 3f6edd06a3..abfb3f0101 100644 --- a/example/gmres/test_real_A.cpp +++ b/example/gmres/test_real_A.cpp @@ -44,6 +44,7 @@ #include #include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" #include #include #include @@ -72,6 +73,7 @@ int main(int /*argc*/, char** /*argv[]*/) { solverOpts.m = 15; // Max subspace size before restarting. solverOpts.tol = 1e-10; // Relative residual convergence tolerance. 
solverOpts.maxRestart = 50; + solverOpts.verbose = false; // No verbosity needed for most testing bool pass1 = false; bool pass2 = false; @@ -88,7 +90,7 @@ int main(int /*argc*/, char** /*argv[]*/) { cOT diagDominance = 1; nnz = 10 * numRows; sp_matrix_type A = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< sp_matrix_type>(numRows, numCols, nnz, 0, ncOT(0.01 * numRows), diagDominance); diff --git a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp index 99b398e40c..e921ed06cd 100644 --- a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp +++ b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp @@ -526,7 +526,9 @@ int main(int argc, char* argv[]) { params.use_openmp; // Assumption is that use_openmp variable is provided // as number of threads const int device_id = 0; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); // Print out information about the configuration of the run if verbose_level // >= 5 diff --git a/example/half/CMakeLists.txt b/example/half/CMakeLists.txt new file mode 100644 index 0000000000..49553f573f --- /dev/null +++ b/example/half/CMakeLists.txt @@ -0,0 +1,7 @@ +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +KOKKOSKERNELS_ADD_EXECUTABLE( + xpy + SOURCES xpy.cpp + ) diff --git a/example/half/us-rse-escience-2022-reproducer.sh b/example/half/us-rse-escience-2022-reproducer.sh new file mode 100755 index 0000000000..8e77f72bc4 --- /dev/null +++ b/example/half/us-rse-escience-2022-reproducer.sh @@ -0,0 +1,239 @@ +#!/bin/bash +################################################################################ +# @Brief: On the specified arch, build and run 
xpy. +# +# Author: Evan Harvey +################################################################################ + +function envprint() { + for x in $@; do + echo $x:\$$x | envsubst + done +} + +function printhelp() { + echo "--Usage--" + echo "$0 HOST_ARCH " + echo " HOST_ARCH: POWER9, A64FX, SKX" + echo " ACCELERATOR_ARCH: VOLTA70, AMPERE80" + echo "" + echo "Invocations used to collect us-rse-escience-2022 results:" + echo "env KOKKOS_SRC_DIR=$HOME/KOKKOS.base/kokkos KOKKOSKERNELS_SRC_DIR=$HOME/KOKKOS.base/kokkos-kernels/ KOKKOSKERNELS_SHA=TODO-HEAD-SHA ./us-rse-escience-2022-reproducer.sh POWER9 VOLTA70" + echo "env KOKKOS_SRC_DIR=$HOME/KOKKOS.base/kokkos KOKKOSKERNELS_SRC_DIR=$HOME/KOKKOS.base/kokkos-kernels/ KOKKOSKERNELS_SHA=TODO-HEAD-SHA ./us-rse-escience-2022-reproducer.sh AMPERE80" +} + +function earlyexit() { + rm -rf $benchmark_dir + exit $1 +} + +function beval() { + local ret=0 + echo "---------------------------------------------------------------------------------------------------------------" + echo "START: \"$@\"" + if [ $dry_run == "off" ]; then + eval $@ + ret=$PIPESTATUS + fi + if [ $ret -ne 0 ]; then + echo "ERROR: \"$@\"" + earlyexit 1 + fi + echo "END : \"$@\"" + echo "---------------------------------------------------------------------------------------------------------------" +} + +# Handle input args +export KOKKOS_SRC_DIR=${KOKKOS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos"} +export KOKKOS_SRC_DIR=$(realpath $KOKKOS_SRC_DIR) +export KOKKOS_SHA=${KOKKOS_SHA:-"tags/3.6.00"} +export KOKKOSKERNELS_SRC_DIR=${KOKKOSKERNELS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos-kernels"} +export KOKKOSKERNELS_SRC_DIR=$(realpath $KOKKOSKERNELS_SRC_DIR) +envprint KOKKOS_SRC_DIR KOKKOS_SHA KOKKOSKERNELS_SRC_DIR KOKKOSKERNELS_SHA + +dry_run="off" +arch_names="$1 $2" +echo "HOST_ARCH=\"$1\", ACCELERATOR_ARCH=\"$2\"" + +# Create benchmark directory +benchmark_dir=$PWD/$0_$(date +"%Y-%m-%d_%H.%M.%S") +beval mkdir -p $benchmark_dir/kokkos-{build,install} +beval mkdir -p 
$benchmark_dir/kokkos-kernels-{build,install} +export KOKKOS_BUILD_DIR=$(realpath $benchmark_dir/kokkos-build) +export KOKKOS_INSTALL_DIR=$(realpath $benchmark_dir/kokkos-install) +export KOKKOSKERNELS_BUILD_DIR=$(realpath $benchmark_dir/kokkos-kernels-build) +export KOKKOSKERNELS_INSTALL_DIR=$(realpath $benchmark_dir/kokkos-kernels-install) +envprint KOKKOS_INSTALL_DIR KOKKOS_BUILD_DIR KOKKOSKERNELS_BUILD_DIR KOKKOSKERNELS_INSTALL_DIR + +# Setup arch specific cmake configurations and job submission commands +if [[ "$arch_names" == " " ]]; then + printhelp; earlyexit 1 +elif [ "$arch_names" == "POWER9 VOLTA70" ]; then + module purge + module load cuda/11.2.0 gcc/8.3.1 cmake/3.18.0 + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=Power9,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --arch=Power9,Volta70 --with-cuda=$CUDA_PATH -- --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' --disable-tests --enable-examples \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="bsub -q normal -W 2:00 -Is $KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="bsub -q normal -W 2:00 -Is 
$KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="bsub -q normal -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/bench.sh" +elif [ "$arch_names" == "SNB VOLTA70" ]; then + module purge + module load sems-archive-env sems-env sems-gcc/8.3.0 sems-cmake/3.19.1 cuda/11.2 sems-archive-git/2.10.1 + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' --disable-tests --enable-examples \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="$KOKKOSKERNELS_BUILD_DIR/bench.sh" +elif [ "$arch_names" == "AMPERE80 " ]; then # trailing space is required: arch_names is "$1 $2" and $2 is empty for this target (same as "A64FX " / "SKX ") + module purge + module load cudatoolkit/11.2 cmake/3.22.0 + + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=Ampere80 --with-cuda=$CUDA_HOME --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR &> kokkos_config_cmd.out" + +
kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR &> kokkos_config_cmd.out" + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --arch=Ampere80 --with-cuda=$CUDA_HOME --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' --disable-tests --enable-examples \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR &> kokkoskernels_config_cmd.out" + + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -S $KOKKOSKERNELS_SRC_DIR -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF &> kokkoskernels_config_cmd.out" + + kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="$KOKKOSKERNELS_BUILD_DIR/bench.sh" +elif [ "$arch_names" == "A64FX " ]; then + export OMP_PROC_BIND=close + export OMP_PLACES=cores + export OMP_NUM_THREADS=48 + module purge + module load gcc/10.2.0 cmake/3.17.0 + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=A64FX \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --cxxflags='-msve-vector-bits=512 -Ofast' --arch=A64FX --with-openmp \ + --disable-tests --enable-examples \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_defaults_cmd="cd 
$KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="srun --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh" +elif [ "$arch_names" == "SKX " ]; then + export OMP_PROC_BIND=close + export OMP_PLACES=cores + export OMP_NUM_THREADS=96 + module purge + module load gcc/7.2.0 cmake/3.19.3 + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=SKX \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --cxxflags='-O3' --arch=SKX --with-openmp --disable-tests --enable-examples \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="srun --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh" + use_simd="--use_simd=1" +else + echo "Invalid arch: $arch_names" + printhelp; earlyexit 1 +fi + +# Write 
the arch agnostic kokkos build script +echo "#!/bin/bash" > $KOKKOS_BUILD_DIR/build.sh +echo "cd $KOKKOS_BUILD_DIR" >> $KOKKOS_BUILD_DIR/build.sh +echo "make -j40 install" >> $KOKKOS_BUILD_DIR/build.sh +chmod +x $KOKKOS_BUILD_DIR/build.sh + +# Write the arch agnostic kokkos-kernels build script +echo "#!/bin/bash" > $KOKKOSKERNELS_BUILD_DIR/build.sh +echo "cd $KOKKOSKERNELS_BUILD_DIR/example/half" >> $KOKKOSKERNELS_BUILD_DIR/build.sh +echo "make -j40 xpy" >> $KOKKOSKERNELS_BUILD_DIR/build.sh +chmod +x $KOKKOSKERNELS_BUILD_DIR/build.sh + +# Write the arch agnostic kokkos-kernels benchmark script +echo "#!/bin/bash" > $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "cd $benchmark_dir" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 10 0 &> xpy_relative_error-10.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 100 0 &> xpy_relative_error-100.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 1000 0 &> xpy_relative_error-1000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 10000 0 &> xpy_relative_error-10000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 100000 0 &> xpy_relative_error-100000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh + +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 50000 1 &> xpy_runtime_only-50000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 500000 1 &> xpy_runtime_only-500000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 5000000 1 &> xpy_runtime_only-5000000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 50000000 1 &> xpy_runtime_only-50000000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +echo "$KOKKOSKERNELS_BUILD_DIR/example/half/xpy 500000000 1 &> xpy_runtime_only-500000000.out" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh +chmod 
+x $KOKKOSKERNELS_BUILD_DIR/bench.sh + +# Check out the correct SHAs +beval "cd $KOKKOS_SRC_DIR && git checkout $KOKKOS_SHA" +beval "cd $KOKKOSKERNELS_SRC_DIR && git checkout $KOKKOSKERNELS_SHA" + +# Build Kokkos +beval $kokkos_config_cmd +beval $kokkos_config_defaults_cmd +beval $kokkos_build_cmd + +# Wait for the file system on the head node to catch up +while [[ "$arch_names" == "POWER9 VOLTA70" && ! -e $KOKKOS_INSTALL_DIR/bin/nvcc_wrapper ]]; do + sleep 3s +done + +# Build KokkosKernels +beval $kokkoskernels_config_cmd +beval $kokkoskernels_config_defaults_cmd +beval $kokkoskernels_build_cmd + +# Run the benchmark +beval $benchmark_cmd diff --git a/example/half/xpy.cpp b/example/half/xpy.cpp new file mode 100644 index 0000000000..bc6bf7481d --- /dev/null +++ b/example/half/xpy.cpp @@ -0,0 +1,137 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER + +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" +#include "KokkosKernels_default_types.hpp" + +template +struct Functor_xpy { + ViewType x, y; + + KOKKOS_INLINE_FUNCTION + void operator()(const int &i) const { x(i) += y(i); } +}; + +template +void do_xpy(size_t n, bool time_only = false) { + using namespace Kokkos; + using ExecutionSpace = typename DeviceType::execution_space; + using ViewType = View; + using ReferenceScalarType = double; + + ViewType x("x", n); + ViewType y("y", n); + View x_rand("x_rand", n); + View y_rand("y_rand", n); + + View expected("expected", n); + View relative_error( + "relative_error", n); + typename ViewType::HostMirror x_host = create_mirror_view(x); + typename ViewType::HostMirror y_host = create_mirror_view(y); + // TODO: Report segfault in random_pool creation with: + // typename ViewType::HostMirror y_host = create_mirror_view(y_host); + + Random_XorShift64_Pool random_pool(12345); + fill_random(x_rand, random_pool, ReferenceScalarType(1.0), + ReferenceScalarType(2.0)); + fill_random(y_rand, random_pool, 
ReferenceScalarType(1.0), + ReferenceScalarType(2.0)); + ExecutionSpace().fence(); + + deep_copy(x, x_rand); + deep_copy(y, y_rand); + ExecutionSpace().fence(); + + deep_copy(x_host, x); + deep_copy(y_host, y); + ExecutionSpace().fence(); + + Functor_xpy xpy; + xpy.x = x; + xpy.y = y; + Timer timer; + parallel_for("xpy", n, xpy); + ExecutionSpace().fence(); + double s = timer.seconds(); + + if (!time_only) { + for (size_t i = 0; i < n; i++) + expected(i) = static_cast(y_host(i)) + + static_cast(x_host(i)); + } + + deep_copy(x_host, x); + ExecutionSpace().fence(); + + std::cout << "n: " << n << ", " << typeid(ScalarType).name() + << " Runtime(s): " << s << std::endl; + + if (!time_only) { + std::cout << "n: " << n << ", " << typeid(ScalarType).name() + << " Relative Errors:" << std::endl; + for (size_t i = 0; i < n; i++) { + std::cout << ", " << std::abs(expected(i) - x_host(i)) / expected(i) + << std::endl; + } + std::cout << std::endl << std::endl; + } +} + +int main(int argc, char **argv) { + Kokkos::initialize(); + if (argc < 2) { + std::cout << "./" << argv[0] << " N:Z TIME_ONLY:{0,1}" << std::endl; + Kokkos::finalize(); + return 1; + } + using LayoutType = Kokkos::LayoutLeft; + using DeviceType = default_device; + size_t n = atoi(argv[1]); + bool time_only = static_cast(atoi(argv[2])); + do_xpy(n, time_only); + do_xpy(n, time_only); + do_xpy(n, time_only); + Kokkos::finalize(); + return 0; +} \ No newline at end of file diff --git a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp index 9909c55720..aec112b584 100644 --- a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp +++ b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp @@ -384,7 +384,9 @@ int main(int argc, char* argv[]) { params.use_openmp; // Assumption is that use_openmp variable is provided // as number of threads - 
Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); if (params.verbose) { Kokkos::print_configuration(std::cout); diff --git a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp index 1fc1fc37d2..ce171c46bd 100644 --- a/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp +++ b/example/wiki/sparse/KokkosSparse_wiki_gauss_seidel.cpp @@ -2,88 +2,96 @@ #include "KokkosKernels_default_types.hpp" #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_gauss_seidel.hpp" #include "KokkosBlas1_nrm2.hpp" -//Parallel Gauss-Seidel Preconditioner/Smoother +// Parallel Gauss-Seidel Preconditioner/Smoother // -Uses graph coloring to find independent row sets, // and applies GS to each set in parallel // -Here, use to solve a diagonally dominant linear system directly. 
-//Helper to print out colors in the shape of the grid -int main() -{ - using Scalar = default_scalar; - using Mag = Kokkos::ArithTraits::mag_type; - using Ordinal = default_lno_t; - using Offset = default_size_type; +// Helper to print out colors in the shape of the grid +int main() { + using Scalar = default_scalar; + using Mag = Kokkos::ArithTraits::mag_type; + using Ordinal = default_lno_t; + using Offset = default_size_type; using ExecSpace = Kokkos::DefaultExecutionSpace; - using MemSpace = typename ExecSpace::memory_space; - using Device = Kokkos::Device; - using Handle = KokkosKernels::Experimental:: - KokkosKernelsHandle; - using Matrix = KokkosSparse::CrsMatrix; - using Vector = typename Matrix::values_type; + using MemSpace = typename ExecSpace::memory_space; + using Device = Kokkos::Device; + using Handle = KokkosKernels::Experimental::KokkosKernelsHandle< + Offset, Ordinal, default_scalar, ExecSpace, MemSpace, MemSpace>; + using Matrix = KokkosSparse::CrsMatrix; + using Vector = typename Matrix::values_type; constexpr Ordinal numRows = 10000; - const Scalar one = Kokkos::ArithTraits::one(); - const Mag magOne = Kokkos::ArithTraits::one(); - //Solve tolerance + const Scalar one = Kokkos::ArithTraits::one(); + const Mag magOne = Kokkos::ArithTraits::one(); + // Solve tolerance const Mag tolerance = 1e-6 * magOne; Kokkos::initialize(); { - //Generate a square, strictly diagonally dominant, but nonsymmetric matrix on which Gauss-Seidel should converge. - //Get approx. 20 entries per row - //Diagonals are 2x the absolute sum of all other entries. + // Generate a square, strictly diagonally dominant, but nonsymmetric matrix + // on which Gauss-Seidel should converge. Get approx. 20 entries per row + // Diagonals are 2x the absolute sum of all other entries. 
Offset nnz = numRows * 20; - Matrix A = KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix(numRows, numRows, nnz, 2, 100, 1.05 * one); - std::cout << "Generated a matrix with " << numRows << " rows/cols, and " << nnz << " entries.\n"; - //Create a kernel handle, then a Gauss-Seidel handle with the default algorithm + Matrix A = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< + Matrix>(numRows, numRows, nnz, 2, 100, 1.05 * one); + std::cout << "Generated a matrix with " << numRows << " rows/cols, and " + << nnz << " entries.\n"; + // Create a kernel handle, then a Gauss-Seidel handle with the default + // algorithm Handle handle; handle.create_gs_handle(KokkosSparse::GS_DEFAULT); - //Set up Gauss-Seidel for the graph (matrix sparsity pattern) - KokkosSparse::Experimental::gauss_seidel_symbolic(&handle, numRows, numRows, A.graph.row_map, A.graph.entries, false); - //Set up Gauss-Seidel for the matrix values (numeric) - //Another matrix with the same sparsity pattern could re-use the handle and symbolic phase, and only call numeric. - KokkosSparse::Experimental::gauss_seidel_numeric(&handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, false); - //Now, preconditioner is ready to use. Set up an unknown vector (uninitialized) and randomized right-hand-side vector. + // Set up Gauss-Seidel for the graph (matrix sparsity pattern) + KokkosSparse::Experimental::gauss_seidel_symbolic( + &handle, numRows, numRows, A.graph.row_map, A.graph.entries, false); + // Set up Gauss-Seidel for the matrix values (numeric) + // Another matrix with the same sparsity pattern could re-use the handle and + // symbolic phase, and only call numeric. + KokkosSparse::Experimental::gauss_seidel_numeric( + &handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, + false); + // Now, preconditioner is ready to use. Set up an unknown vector + // (uninitialized) and randomized right-hand-side vector. 
Vector x(Kokkos::view_alloc(Kokkos::WithoutInitializing, "x"), numRows); Vector b(Kokkos::view_alloc(Kokkos::WithoutInitializing, "b"), numRows); Vector res(Kokkos::view_alloc(Kokkos::WithoutInitializing, "res"), numRows); auto bHost = Kokkos::create_mirror_view(b); - for(Ordinal i = 0; i < numRows; i++) + for (Ordinal i = 0; i < numRows; i++) bHost(i) = 3 * ((one * rand()) / RAND_MAX); Kokkos::deep_copy(b, bHost); - //Measure initial residual norm ||Ax - b||, where x is 0 - Mag initialRes = KokkosBlas::nrm2(b); + // Measure initial residual norm ||Ax - b||, where x is 0 + Mag initialRes = KokkosBlas::nrm2(b); Mag scaledResNorm = magOne; - bool firstIter = true; - //Iterate until reaching the tolerance + bool firstIter = true; + // Iterate until reaching the tolerance int numIters = 0; - while(scaledResNorm > tolerance) - { - //Run one sweep of forward Gauss-Seidel (SOR with omega = 1.0) - //If this is the first iteration, tell apply: + while (scaledResNorm > tolerance) { + // Run one sweep of forward Gauss-Seidel (SOR with omega = 1.0) + // If this is the first iteration, tell apply: // * to zero out x (it was uninitialized) - // * that b has changed since the previous apply (since there was no previous apply) + // * that b has changed since the previous apply (since there was no + // previous apply) KokkosSparse::Experimental::forward_sweep_gauss_seidel_apply( - &handle, numRows, numRows, - A.graph.row_map, A.graph.entries, A.values, + &handle, numRows, numRows, A.graph.row_map, A.graph.entries, A.values, x, b, firstIter, firstIter, one, 1); firstIter = false; - //Now, compute the new residual norm using SPMV + // Now, compute the new residual norm using SPMV Kokkos::deep_copy(res, b); - //Compute res := Ax - res (since res is now equal to b, this is Ax - b) + // Compute res := Ax - res (since res is now equal to b, this is Ax - b) KokkosSparse::spmv("N", one, A, x, -one, res); - //Recompute the scaled norm + // Recompute the scaled norm scaledResNorm = 
KokkosBlas::nrm2(res) / initialRes; numIters++; - std::cout << "Iteration " << numIters << " scaled residual norm: " << scaledResNorm << '\n'; + std::cout << "Iteration " << numIters + << " scaled residual norm: " << scaledResNorm << '\n'; } std::cout << "SUCCESS: converged in " << numIters << " iterations.\n"; } Kokkos::finalize(); return 0; } - diff --git a/master_history.txt b/master_history.txt index ddf9143c73..91399d7ba0 100644 --- a/master_history.txt +++ b/master_history.txt @@ -17,3 +17,4 @@ tag: 3.4.01 date: 05/20/2021 master: 564dccb3 release: 4c62eb86 tag: 3.5.00 date: 11/19/2021 master: 00189c0b release: f171533d tag: 3.6.00 date: 04/06/2022 master: 8381db04 release: a7e683c4 tag: 3.6.01 date: 05/23/2022 master: e09389ae release: e1d8de42 +tag: 3.7.00 date: 08/25/2022 master: 42ab7a29 release: 9cc88ffa diff --git a/perf_test/batched/CMakeLists.txt b/perf_test/batched/CMakeLists.txt index 36435ecfc1..d044cf021f 100644 --- a/perf_test/batched/CMakeLists.txt +++ b/perf_test/batched/CMakeLists.txt @@ -1,9 +1,9 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockTridiag - SOURCES KokkosBatched_Test_BlockTridiagDirect.cpp -) -KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockJacobi - SOURCES KokkosBatched_Test_BlockJacobi_Tutorial.cpp -) +#KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockTridiag +# SOURCES KokkosBatched_Test_BlockTridiagDirect.cpp +#) +#KOKKOSKERNELS_ADD_EXECUTABLE(KokkosBatched_Test_BlockJacobi +# SOURCES KokkosBatched_Test_BlockJacobi_Tutorial.cpp +#) diff --git a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp index f3237d9b4f..94f58fba83 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp @@ -3,16 +3,6 @@ #include "Kokkos_Timer.hpp" 
#include "Kokkos_Random.hpp" -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) -#if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#define KOKKOSBATCHED_TEST_BLOCKJACOBI -#endif -#endif -#endif - -#if defined(KOKKOSBATCHED_TEST_BLOCKJACOBI) - /// KokkosKernels headers #include "KokkosBatched_Util.hpp" @@ -79,6 +69,152 @@ val_type computeResidual(const ManyMatrixType &A, const ManyVectorType &x, return residual; } +namespace ConstructBlockJacobi { +template +struct Task1Factorize { + private: + VT __A; + + public: + Task1Factorize(VT A) : __A(A) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + TeamLU::invoke(member, AA); + } +}; + +template +struct Task1SetIdentity { + private: + VT __A; + + public: + Task1SetIdentity(VT A) : __A(A) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + TeamSetIdentity::invoke(member, AA); + } +}; + +template +struct Task1SolveLowerTriangular { + private: + VTA __A; + VTT __T; + + public: + Task1SolveLowerTriangular(VTA A, VTT T) : __A(A), __T(T) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL()); + TeamTrsm::invoke(member, one, TT, AA); + } +}; + +template +struct Task1SolveUpperTriangular { + private: + VTA __A; + VTT __T; + + public: + Task1SolveUpperTriangular(VTA A, VTT T) : __A(A), __T(T) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), 
Kokkos::ALL()); + auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL()); + TeamTrsm::invoke(member, one, TT, + AA); + } +}; +} // namespace ConstructBlockJacobi + +template +struct Task1ApplyBlockJacobi { + private: + VTA __A; + VTX __x; + VTB __b; + + public: + Task1ApplyBlockJacobi(VTA A, VTX x, VTB b) : __A(A), __x(x), __b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + const val_type one(1), zero(0); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(__x, i, Kokkos::ALL()); + auto bb = Kokkos::subview(__b, i, Kokkos::ALL()); + TeamGemv::invoke( + member, one, AA, bb, zero, xx); + } +}; + +template +struct Task2FactorizeInvert { + private: + VTA __A; + VTT __T; + + public: + Task2FactorizeInvert(VTA A, VTT T) : __A(A), __T(T) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const val_type one(1); + const int i = member.league_rank(); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(__T, i, Kokkos::ALL(), Kokkos::ALL()); + + TeamLU::invoke(member, AA); + TeamCopy::invoke(member, AA, TT); + TeamSetIdentity::invoke(member, AA); + TeamTrsm::invoke(member, one, TT, AA); + TeamTrsm::invoke(member, one, TT, + AA); + } +}; + +template +struct Task2ApplyBlockJacobi { + private: + VTA __A; + VTX __x; + VTB __b; + + public: + Task2ApplyBlockJacobi(VTA A, VTX x, VTB b) : __A(A), __x(x), __b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + const val_type one(1), zero(0); + auto AA = Kokkos::subview(__A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(__x, i, Kokkos::ALL()); + auto bb = Kokkos::subview(__b, i, Kokkos::ALL()); + TeamGemv::invoke( + member, one, AA, bb, zero, xx); + } +}; + int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { @@ -159,44 +295,21 
@@ int main(int argc, char *argv[]) { timer.reset(); Kokkos::parallel_for( "task1.factorize", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - TeamLU::invoke(member, AA); - }); + ConstructBlockJacobi::Task1Factorize(A)); Kokkos::deep_copy(T, A); Kokkos::parallel_for( "task1.set-identity", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - TeamSetIdentity::invoke(member, AA); - }); + ConstructBlockJacobi::Task1SetIdentity(A)); Kokkos::fence(); Kokkos::parallel_for( "task1.solve-lower-triangular", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm::invoke(member, one, - TT, AA); - }); + ConstructBlockJacobi::Task1SolveLowerTriangular(A, T)); Kokkos::fence(); Kokkos::parallel_for( "task1.solve-upper-triangular", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm::invoke(member, - one, TT, - AA); - }); + ConstructBlockJacobi::Task1SolveUpperTriangular(A, T)); Kokkos::fence(); const double t = timer.seconds(); printf( @@ -211,16 +324,8 @@ int main(int argc, char *argv[]) { policy_type policy(A.extent(0), Kokkos::AUTO()); Kokkos::parallel_for( "task1.apply-block-jacobi", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1), zero(0); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto bb = Kokkos::subview(b, i, Kokkos::ALL()); 
- TeamGemv::invoke(member, one, AA, bb, - zero, xx); - }); + Task1ApplyBlockJacobi(A, x, + b)); const double t = timer.seconds(); printf( "task 1: application of jacobi time = %f , # of applications per " @@ -256,23 +361,7 @@ int main(int argc, char *argv[]) { timer.reset(); Kokkos::parallel_for( "task2.factorize-invert", policy, - KOKKOS_LAMBDA(const member_type &member) { - const val_type one(1); - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - - TeamLU::invoke(member, AA); - TeamCopy::invoke(member, AA, TT); - TeamSetIdentity::invoke(member, AA); - TeamTrsm::invoke(member, one, - TT, AA); - TeamTrsm::invoke(member, - one, TT, - AA); - }); + Task2FactorizeInvert(A, T)); Kokkos::fence(); const double t = timer.seconds(); printf( @@ -287,16 +376,8 @@ int main(int argc, char *argv[]) { policy_type policy(A.extent(0), Kokkos::AUTO()); Kokkos::parallel_for( "task2.apply-block-jacobi", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1), zero(0); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto bb = Kokkos::subview(b, i, Kokkos::ALL()); - TeamGemv::invoke(member, one, AA, bb, - zero, xx); - }); + Task2ApplyBlockJacobi(A, x, + b)); const double t = timer.seconds(); printf( "task 2: application of jacobi time = %f , # of applications per " @@ -318,7 +399,3 @@ int main(int argc, char *argv[]) { return 0; } - -#else -int main() { return 0; } -#endif diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp index a8b3de209b..ffa6efec5e 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -3,16 +3,6 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" 
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) -#if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT -#endif -#endif -#endif - -#if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT) - /// KokkosKernels headers #include "KokkosBatched_Util.hpp" #include "KokkosBatched_Vector.hpp" @@ -47,11 +37,13 @@ #define KOKKOSBATCHED_USE_128BIT_MEMORY_INST -typedef Kokkos::DefaultExecutionSpace exec_space; -typedef typename exec_space::memory_space memory_space; -typedef Kokkos::DefaultHostExecutionSpace host_space; +using exec_space_type = Kokkos::DefaultExecutionSpace; +using memory_space_type = exec_space_type::memory_space; +using host_space_type = Kokkos::DefaultHostExecutionSpace; -typedef double value_type; +using value_type = double; +using policy_type = Kokkos::TeamPolicy; +using member_type = typename policy_type::member_type; /// 128*128*128/16*5 * (2*8) / 16 /// @@ -60,10 +52,10 @@ typedef double value_type; using namespace KokkosBatched; static constexpr int vector_length = - DefaultVectorLength::value; + DefaultVectorLength::value; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) static constexpr int internal_vector_length = - DefaultInternalVectorLength::value; + DefaultInternalVectorLength::value; #else static constexpr int internal_vector_length = 1; #endif @@ -75,40 +67,161 @@ typedef Vector, internal_vector_length> internal_vector_type; typedef value_type internal_vector_type; #endif -template +template struct FactorizeModeAndAlgo; -template <> -struct FactorizeModeAndAlgo { +struct FactorizeModeAndAlgoHostImpl { typedef Mode::Serial mode_type; typedef Algo::Level3::Blocked algo_type; }; -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_SERIAL) +template <> +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +template <> +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoHostImpl {}; +#endif + +#if 
defined(KOKKOS_ENABLE_OPENMP) template <> -struct FactorizeModeAndAlgo { +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoHostImpl {}; +#endif + +struct FactorizeModeAndAlgoDeviceImpl { typedef Mode::Team mode_type; typedef Algo::Level3::Unblocked algo_type; }; + +#if defined(KOKKOS_ENABLE_CUDA) +template <> +struct FactorizeModeAndAlgo : FactorizeModeAndAlgoDeviceImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +template <> +struct FactorizeModeAndAlgo + : FactorizeModeAndAlgoDeviceImpl {}; #endif -template +template struct SolveModeAndAlgo; -template <> -struct SolveModeAndAlgo { +struct SolveModeAndAlgoHostImpl { typedef Mode::Serial mode_type; typedef Algo::Level2::Blocked algo_type; }; -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_SERIAL) template <> -struct SolveModeAndAlgo { +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +struct SolveModeAndAlgoDeviceImpl { typedef Mode::Team mode_type; typedef Algo::Level2::Unblocked algo_type; }; + +#if defined(KOKKOS_ENABLE_CUDA) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +template <> +struct SolveModeAndAlgo + : SolveModeAndAlgoDeviceImpl {}; #endif +template +struct SetTridiagToIdentity { + private: + VT __AA; + + public: + SetTridiagToIdentity(VT AA) : __AA(AA) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + const int i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, __AA.extent(1)), [&](const int &j) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, __AA.extent(5)), + [&](const int &v) { + for (int k = 0, kend = __AA.extent(3); k < kend; ++k) + __AA(i, j, 1, k, k, v) = 1; + }); + }); + } +}; + +template 
+struct Factorize { + private: + VT __AA; + LT __L; + + public: + Factorize(VT AA, LT L) : __AA(AA), __L(L) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type &member) const { + typedef FactorizeModeAndAlgo + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + const int i = member.league_rank(); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, __AA.extent(5)), [&](const int &v) { + auto AAA = Kokkos::subview(__AA, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), v); + + /// subview patterns + auto A = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); + auto D = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + + if (__L == 1) { + A.assign_data(&AAA(0, 1, 0, 0)); + LU::invoke(member, A); + } else { + for (int k = 0; k < (__L - 1); ++k) { + A.assign_data(&AAA(k, 1, 0, 0)); + B.assign_data(&AAA(k, 2, 0, 0)); + C.assign_data(&AAA(k, 0, 0, 0)); + D.assign_data(&AAA(k + 1, 1, 0, 0)); + + LU::invoke(member, A); + Trsm::invoke(member, 1.0, A, B); + Trsm::invoke(member, 1.0, A, + C); + Gemm::invoke(member, -1.0, C, B, 1.0, D); + } + LU::invoke(member, D); + } + }); + } +}; + int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { @@ -149,53 +262,56 @@ int main(int argc, char *argv[]) { /// /// double 16 - Kokkos::View Av( + Kokkos::View Av( "A", N / vector_length, L, 3, Blk, Blk); /// double - Kokkos::View As( + Kokkos::View As( (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), vector_length); /// double 2 - Kokkos::View + Kokkos::View Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), Av.extent(3), Av.extent(4), vector_length / internal_vector_length); /// double 16 - Kokkos::View xv( + 
Kokkos::View xv( "x", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View xs( + Kokkos::View xs( (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), vector_length); /// double 2 - Kokkos::View + Kokkos::View xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), xv.extent(3), vector_length / internal_vector_length); /// double 16 - Kokkos::View bv( + Kokkos::View bv( "b", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View bs( + Kokkos::View bs( (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), bv.extent(3), vector_length); /// double 2 - Kokkos::View + Kokkos::View bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), bv.extent(3), vector_length / internal_vector_length); /// double copy of A - Kokkos::View Acopy( + Kokkos::View Acopy( "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), As.extent(4), As.extent(5)); - Kokkos::View rs( + Kokkos::View rs( "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3), bs.extent(4)); @@ -217,24 +333,9 @@ int main(int argc, char *argv[]) { cudaProfilerStart(); #endif timer.reset(); - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; policy_type policy(AA.extent(0), Kokkos::AUTO(), AA.extent(5)); - Kokkos::parallel_for( - "setTridiagToIdentity", policy, - KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, AA.extent(1)), - [&](const int &j) { - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - for (int k = 0, kend = AA.extent(3); k < kend; ++k) - AA(i, j, 1, k, k, v) = 1; - }); - }); - }); + Kokkos::parallel_for("setTridiagToIdentity", policy, + SetTridiagToIdentity(AA)); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -246,7 +347,7 @@ int main(int 
argc, char *argv[]) { /// randomize input { const value_type one(1); - Kokkos::Random_XorShift64_Pool random(13245); + Kokkos::Random_XorShift64_Pool random(13245); Kokkos::fill_random(As, random, one); Kokkos::fill_random(bs, random, one); @@ -261,9 +362,7 @@ int main(int argc, char *argv[]) { cudaProfilerStart(); #endif timer.reset(); - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - int team_size = 0; + int team_size = 0; if (Blk < 8) { team_size = 32 / AA.extent(5); } else if (Blk < 12) { @@ -273,59 +372,9 @@ int main(int argc, char *argv[]) { } policy_type policy(AA.extent(0), team_size, AA.extent(5)); - Kokkos::parallel_for( - "factorize", policy.set_scratch_size(0, Kokkos::PerTeam(S)), - KOKKOS_LAMBDA(const member_type &member) { - typedef FactorizeModeAndAlgo< - Kokkos::Impl::ActiveExecutionMemorySpace> - default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; - - const int i = member.league_rank(); - - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, AA.extent(5)), - [&](const int &v) { - auto AAA = - Kokkos::subview(AA, i, Kokkos::ALL(), Kokkos::ALL(), - Kokkos::ALL(), Kokkos::ALL(), v); - - /// subview patterns - auto A = - Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - auto B = - Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); - auto C = - Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - auto D = - Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - - if (L == 1) { - A.assign_data(&AAA(0, 1, 0, 0)); - LU::invoke(member, A); - } else { - for (int k = 0; k < (L - 1); ++k) { - A.assign_data(&AAA(k, 1, 0, 0)); - B.assign_data(&AAA(k, 2, 0, 0)); - C.assign_data(&AAA(k, 0, 0, 0)); - D.assign_data(&AAA(k + 1, 1, 0, 0)); - - LU::invoke(member, A); - Trsm::invoke(member, 1.0, A, B); - Trsm::invoke(member, 1.0, A, C); - Gemm::invoke(member, -1.0, C, B, - 1.0, D); - } - 
LU::invoke(member, D); - } - }); - }); + Kokkos::parallel_for("factorize", + policy.set_scratch_size(0, Kokkos::PerTeam(S)), + Factorize(AA, L)); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -343,9 +392,7 @@ int main(int argc, char *argv[]) { cudaProfilerStart(); #endif timer.reset(); - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - int team_size = 0; + int team_size = 0; if (Blk < 8) { team_size = 32 / AA.extent(5); } else if (Blk < 12) { @@ -359,7 +406,7 @@ int main(int argc, char *argv[]) { Kokkos::parallel_for( "solve", policy.set_scratch_size(0, Kokkos::PerTeam(S)), KOKKOS_LAMBDA(const member_type &member) { - typedef SolveModeAndAlgo + typedef SolveModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; @@ -488,8 +535,6 @@ int main(int argc, char *argv[]) { /// if (1) { typedef KokkosBatched::Algo::Level2::Unblocked algo_type; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; policy_type policy(Acopy.extent(0), Kokkos::AUTO(), Acopy.extent(5)); Kokkos::parallel_for( "compute residual", policy, KOKKOS_LAMBDA(const member_type &member) { @@ -639,7 +684,3 @@ int main(int argc, char *argv[]) { return 0; } - -#else -int main() { return 0; } -#endif diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp index fb9cd6297d..8513cad752 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagJacobi.cpp @@ -3,11 +3,9 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) -#if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) +#if !(defined(KOKKOS_ENABLE_CUDA) && 
!defined(KOKKOS_ENABLE_CUDA_LAMBDA)) #define KOKKOSBATCHED_TEST_BLOCKTRIDIAGJACOBI #endif -#endif #if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGJACOBI) @@ -75,38 +73,86 @@ typedef Vector, internal_vector_length> internal_vector_type; typedef value_type internal_vector_type; #endif -template +template struct InverseDiagonalsModeAndAlgo; -template <> -struct InverseDiagonalsModeAndAlgo { +struct InverseDiagonalsModeAndAlgoHostImpl { typedef Mode::Serial mode_type; typedef Algo::Level3::Blocked algo_type; }; -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_SERIAL) +template <> +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +template <> +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_ONPENMP) template <> -struct InverseDiagonalsModeAndAlgo { +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoHostImpl {}; +#endif + +struct InverseDiagonalsModeAndAlgoDeviceImpl { typedef Mode::Team mode_type; typedef Algo::Level3::Unblocked algo_type; }; + +#if defined(KOKKOS_ENABLE_CUDA) +template <> +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoDeviceImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +template <> +struct InverseDiagonalsModeAndAlgo + : InverseDiagonalsModeAndAlgoDeviceImpl {}; #endif -template +template struct SolveModeAndAlgo; -template <> -struct SolveModeAndAlgo { +struct SolveModeAndAlgoHostImpl { typedef Mode::Serial mode_type; typedef Algo::Level2::Blocked algo_type; }; -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_SERIAL) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_THREADS) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_OPENMP) template <> -struct SolveModeAndAlgo { +struct SolveModeAndAlgo : SolveModeAndAlgoHostImpl {}; +#endif + +struct 
SolveModeAndAlgoDeviceImpl { typedef Mode::Team mode_type; typedef Algo::Level2::Unblocked algo_type; }; + +#if defined(KOKKOS_ENABLE_CUDA) +template <> +struct SolveModeAndAlgo : SolveModeAndAlgoDeviceImpl {}; +#endif + +#if defined(KOKKOS_ENABLE_HIP) +template <> +struct SolveModeAndAlgo + : SolveModeAndAlgoDeviceImpl {}; #endif int main(int argc, char *argv[]) { @@ -282,8 +328,7 @@ int main(int argc, char *argv[]) { policy.set_scratch_size( 0, Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), KOKKOS_LAMBDA(const member_type &member) { - typedef InverseDiagonalsModeAndAlgo< - Kokkos::Impl::ActiveExecutionMemorySpace> + typedef InverseDiagonalsModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; @@ -365,8 +410,7 @@ int main(int argc, char *argv[]) { 0, Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), KOKKOS_LAMBDA(const member_type &member) { - typedef SolveModeAndAlgo< - Kokkos::Impl::ActiveExecutionMemorySpace> + typedef SolveModeAndAlgo default_mode_and_algo_type; typedef default_mode_and_algo_type::mode_type mode_type; typedef default_mode_and_algo_type::algo_type algo_type; diff --git a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp index 49032307c4..7b353cf160 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_mv_perf_test.cpp @@ -50,6 +50,7 @@ struct Params { int use_cuda = 0; int use_hip = 0; + int use_sycl = 0; int use_openmp = 0; int use_threads = 0; // m is vector length @@ -63,7 +64,8 @@ void print_options() { std::cerr << "Options:\n" << std::endl; std::cerr << "\tBACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | " - "'--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" + "'--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]' | " + "'--sycl [syclDeviceIndex]'" << std::endl; std::cerr << "\tIf no 
BACKEND selected, serial is the default." << std::endl; std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " @@ -90,6 +92,8 @@ int parse_inputs(Params& params, int argc, char** argv) { params.use_cuda = atoi(argv[++i]) + 1; } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { params.use_hip = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) { + params.use_sycl = atoi(argv[++i]) + 1; } else if (0 == Test::string_compare_no_case(argv[i], "--m")) { params.m = atoi(argv[++i]); } else if (0 == Test::string_compare_no_case(argv[i], "--n")) { @@ -190,17 +194,21 @@ int main(int argc, char** argv) { if (parse_inputs(params, argc, argv)) { return 1; } - const int device_id = std::max(params.use_cuda, params.use_hip) - 1; + const int device_id = + std::max(std::max(params.use_cuda, params.use_hip), params.use_sycl) - 1; const int num_threads = std::max(params.use_openmp, params.use_threads); - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); bool useThreads = params.use_threads != 0; bool useOMP = params.use_openmp != 0; bool useCUDA = params.use_cuda != 0; bool useHIP = params.use_hip != 0; - bool useSerial = !useThreads && !useOMP && !useCUDA && !useHIP; + bool useSYCL = params.use_sycl != 0; + bool useSerial = !useThreads && !useOMP && !useCUDA && !useHIP && !useSYCL; if (useThreads) { #if defined(KOKKOS_ENABLE_THREADS) @@ -234,6 +242,14 @@ int main(int argc, char** argv) { #else std::cout << "ERROR: HIP requested, but not available.\n"; return 1; +#endif + } + if (useSYCL) { +#if defined(KOKKOS_ENABLE_SYCL) + run(params.m, params.n, params.repeat); +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + return 1; #endif } if (useSerial) { diff --git a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp 
b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp index 9219d34810..50840ddea6 100644 --- a/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_dot_perf_test.cpp @@ -54,6 +54,8 @@ struct Params { int use_cuda = 0; int use_openmp = 0; int use_threads = 0; + int use_hip = 0; + int use_sycl = 0; // m is vector length int m = 100000; int repeat = 1; @@ -63,7 +65,8 @@ void print_options() { std::cerr << "Options:\n" << std::endl; std::cerr << "\tBACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | " - "'--cuda [cudaDeviceIndex]'" + "'--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]' | " + "'--sycl [syclDeviceIndex]'" << std::endl; std::cerr << "\tIf no BACKEND selected, serial is the default." << std::endl; std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " @@ -86,6 +89,10 @@ int parse_inputs(Params& params, int argc, char** argv) { params.use_openmp = atoi(argv[++i]); } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { params.use_cuda = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { + params.use_hip = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) { + params.use_sycl = atoi(argv[++i]) + 1; } else if (0 == Test::string_compare_no_case(argv[i], "--m")) { params.m = atoi(argv[++i]); } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { @@ -184,16 +191,21 @@ int main(int argc, char** argv) { if (parse_inputs(params, argc, argv)) { return 1; } - const int device_id = params.use_cuda - 1; + const int device_id = + std::max(std::max(params.use_cuda, params.use_hip), params.use_sycl) - 1; const int num_threads = std::max(params.use_openmp, params.use_threads); - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); bool useThreads = params.use_threads != 0; bool 
useOMP = params.use_openmp != 0; bool useCUDA = params.use_cuda != 0; - bool useSerial = !useThreads && !useOMP && !useCUDA; + bool useHIP = params.use_hip != 0; + bool useSYCL = params.use_sycl != 0; + bool useSerial = !useThreads && !useOMP && !useCUDA && !useHIP && !useSYCL; if (useThreads) { #if defined(KOKKOS_ENABLE_THREADS) @@ -221,6 +233,25 @@ int main(int argc, char** argv) { return 1; #endif } + + if (useHIP) { +#if defined(KOKKOS_ENABLE_HIP) + run(params.m, params.repeat); +#else + std::cout << "ERROR: HIP requested, but not available.\n"; + return 1; +#endif + } + + if (useSYCL) { +#if defined(KOKKOS_ENABLE_SYCL) + run(params.m, params.repeat); +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + return 1; +#endif + } + if (useSerial) { #if defined(KOKKOS_ENABLE_SERIAL) run(params.m, params.repeat); diff --git a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp index f8a2a5aa43..eeb49d6502 100644 --- a/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp +++ b/perf_test/blas/blas1/KokkosBlas_team_dot_perf_test.cpp @@ -188,7 +188,9 @@ int main(int argc, char** argv) { const int num_threads = std::max(params.use_openmp, params.use_threads); - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); bool useThreads = params.use_threads != 0; bool useOMP = params.use_openmp != 0; diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp index a82ece030b..98e974229b 100644 --- a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp +++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp @@ -180,7 +180,9 @@ int main(int argc, char** argv) { const int num_threads = std::max(params.use_openmp, params.use_threads); const int device_id = std::max(params.use_cuda, params.use_hip) - 1; - 
Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); // Create booleans to handle pthreads, openmp and cuda params and initialize // to true; diff --git a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh index 0b08977748..3b382a474c 100755 --- a/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh +++ b/perf_test/blas/blas3/KokkosBatched_BatchedGemm_benchmark.sh @@ -18,8 +18,8 @@ function printhelp() { echo "--Usage--" echo "$0 PRECISION HOST_ARCH " echo " PRECISION: Kokkos::Experimental::half_t, float, double" - echo " HOST_ARCH: POWER9, A64FX, SKX" - echo " ACCELERATOR_ARCH: VOLTA70" + echo " HOST_ARCH: POWER9, A64FX, SKX, SNB, DEFAULT" + echo " ACCELERATOR_ARCH: VOLTA70 AMPERE80" echo "" } @@ -47,10 +47,10 @@ function beval() { # Handle input args export KOKKOS_SRC_DIR=${KOKKOS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos"} export KOKKOS_SRC_DIR=$(realpath $KOKKOS_SRC_DIR) -export KOKKOS_SHA=${KOKKOS_SHA:-"b9f15a4"} # Tip of develop as of 10-14-21 +export KOKKOS_SHA=${KOKKOS_SHA:-"tags/3.6.00"} export KOKKOSKERNELS_SRC_DIR=${KOKKOSKERNELS_SRC_DIR:-"$HOME/KOKKOS.base/kokkos-kernels"} export KOKKOSKERNELS_SRC_DIR=$(realpath $KOKKOSKERNELS_SRC_DIR) -export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"a2fff48"} # Tip of developer as of 10-14-21 +export KOKKOSKERNELS_SHA=${KOKKOSKERNELS_SHA:-"tags/papers/us-rse-escience-2022"} envprint KOKKOS_SRC_DIR KOKKOS_SHA KOKKOSKERNELS_SRC_DIR KOKKOSKERNELS_SHA dry_run="off" @@ -82,7 +82,7 @@ elif [ "$arch_names" == "POWER9 VOLTA70" ]; then kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ --arch=Power9,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ - --cxxflags='-O3' --with-scalars=$precision \ + --cxxflags='-O3' --disable-tests 
--enable-examples --with-scalars=$precision \ --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ tee kokkoskernels_config_cmd.out" @@ -93,6 +93,49 @@ elif [ "$arch_names" == "POWER9 VOLTA70" ]; then kokkos_build_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOS_BUILD_DIR/build.sh" kokkoskernels_build_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/build.sh" benchmark_cmd="bsub -q rhel7W -W 2:00 -Is $KOKKOSKERNELS_BUILD_DIR/bench.sh" +elif [ "$arch_names" == "SNB VOLTA70" ]; then + module purge + module load sems-archive-env sems-env sems-gcc/8.3.0 sems-cmake/3.19.1 cuda/11.2 sems-archive-git/2.10.1 + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ + | tee -a kokkos_config_cmd.out" + + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --arch=SNB,Volta70 --with-cuda=$CUDA_PATH --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' --with-scalars=$precision \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR 2>&1 | \ + tee kokkoskernels_config_cmd.out" + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ + $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" + + kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh" + 
benchmark_cmd="$KOKKOSKERNELS_BUILD_DIR/bench.sh" +elif [ "$arch_names" == "DEFAULT AMPERE80" ]; then + module purge + module load cudatoolkit/11.2 cmake/3.22.0 + + kokkos_config_cmd="cd $KOKKOS_BUILD_DIR && $KOKKOS_SRC_DIR/generate_makefile.bash --cxxflags='-O3' \ + --arch=Ampere80 --with-cuda=$CUDA_HOME --compiler=$KOKKOS_SRC_DIR/bin/nvcc_wrapper \ + --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR &> kokkos_config_cmd.out" + + kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR &> kokkos_config_cmd.out" + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ + --arch=Ampere80 --with-cuda=$CUDA_HOME --compiler=$KOKKOS_INSTALL_DIR/bin/nvcc_wrapper \ + --cxxflags='-O3' --with-scalars=$precision \ + --kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ + --kokkos-prefix=$KOKKOS_INSTALL_DIR --prefix=$KOKKOSKERNELS_INSTALL_DIR &> kokkoskernels_config_cmd.out" + + kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -S $KOKKOSKERNELS_SRC_DIR -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF &> kokkoskernels_config_cmd.out" + + kokkos_build_cmd="$KOKKOS_BUILD_DIR/build.sh" + kokkoskernels_build_cmd="$KOKKOSKERNELS_BUILD_DIR/build.sh" + benchmark_cmd="$KOKKOSKERNELS_BUILD_DIR/bench.sh" elif [ "$arch_names" == "A64FX " ]; then export OMP_PROC_BIND=close export OMP_PLACES=cores @@ -128,7 +171,7 @@ elif [ "$arch_names" == "SKX " ]; then --kokkos-path=$KOKKOS_SRC_DIR --prefix=$KOKKOS_INSTALL_DIR 2>&1 | tee kokkos_config_cmd.out" kokkos_config_defaults_cmd="cd $KOKKOS_BUILD_DIR && cmake -DKokkos_ENABLE_TESTS:BOOL=OFF $KOKKOS_SRC_DIR 2>&1 \ | tee -a kokkos_config_cmd.out" - + kokkoskernels_config_cmd="cd $KOKKOSKERNELS_BUILD_DIR && $KOKKOSKERNELS_SRC_DIR/cm_generate_makefile.bash \ --cxxflags='-O3' --arch=SKX --with-scalars=$precision --with-openmp \ 
--kokkos-path=$KOKKOS_SRC_DIR --kokkoskernels-path=$KOKKOSKERNELS_SRC_DIR \ @@ -137,7 +180,7 @@ elif [ "$arch_names" == "SKX " ]; then kokkoskernels_config_defaults_cmd="cd $KOKKOSKERNELS_BUILD_DIR && cmake -DKokkosKernels_INST_LAYOUTLEFT:BOOL=OFF \ -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON -DKokkosKernels_INST_DOUBLE:BOOL=OFF \ $KOKKOSKERNELS_SRC_DIR 2>&1 | tee -a kokkoskernels_config_cmd.out" - + kokkos_build_cmd="srun --time=2:00:00 -N1 $KOKKOS_BUILD_DIR/build.sh" kokkoskernels_build_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/build.sh" benchmark_cmd="srun --time=2:00:00 -N1 $KOKKOSKERNELS_BUILD_DIR/bench.sh" @@ -165,7 +208,7 @@ echo "cd $benchmark_dir" >> $KOKKOSKERNELS_BUILD_DIR/bench.sh echo "$KOKKOSKERNELS_BUILD_DIR/perf_test/blas/blas3/KokkosBlas3_perf_test \ --test=batched_heuristic --routines=gemm --loop_type=parallel --batch_size_last_dim=0 \ --matrix_size_start=2x2,2x2,2x2 --matrix_size_stop=64x64,64x64,64x64 \ - --matrix_size_step=2 --batch_size=1024 \ + --matrix_size_step=2 --batch_size=$((32*1024)) \ --warm_up_loop=10 --iter=20 --verify=1 \ ${use_simd} \ --csv=${benchmark_dir}/${precision}_bench.csv" \ diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index e3d991c7c1..d1855573e4 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -263,9 +263,11 @@ static std::string gemm_csv_header_str = // Flop count formula from lapack working note 41: // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf static inline double __gemm_flop_count(double a_m, double a_n, double b_n) { + // TODO: if not Kokkos::complex. 
if (std::is_same::value || std::is_same::value || - std::is_same::value) + std::is_same::value || + std::is_same::value) return 2 * a_m * b_n * a_n; else // For complex, we need to count 2 flops for each add and 6 flops for each @@ -1574,8 +1576,8 @@ static inline bool __gemm_print_compare_failure(ViewType h_expected, ViewType h_actual, int i, int j, int k, double epsilon) { STATUS; - auto diff = static_cast(Kokkos::Experimental::fabs( - static_cast(h_expected(i, j, k) - h_actual(i, j, k)))); + auto diff = + std::fabs(static_cast(h_expected(i, j, k) - h_actual(i, j, k))); if (diff > epsilon) { printf( @@ -1775,6 +1777,11 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, Kokkos::deep_copy(dst, h_dst); Kokkos::fence(); #else + // Avoid unused parameter warnings: + (void)src; + (void)dst; + (void)options; + Kokkos::abort( "Cannot perform simd verification with cuda/10.2.2, rerun with -v 0"); #endif // #if (CUDA_VERSION != 10020) @@ -1883,7 +1890,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, // Check the result if (gemm_args.C.data() != nullptr) { -#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 if (options.test == EXPERIMENT) { using view_type_2d = Kokkos::View; @@ -1908,7 +1915,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, } } } -#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL +#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL && ARMPL_BUILD >= 1058 if (__gemm_do_compare(C_expected, gemm_args.C)) FATAL_ERROR("Result value mismatch!"); } @@ -2078,7 +2085,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { Kokkos::fence(); } -#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) +#if defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) && ARMPL_BUILD >= 1058 if (options.test == EXPERIMENT) { armpl_int_t bstrd_A, istrd_A, jstrd_A, bstrd_B, istrd_B, jstrd_B, bstrd_C, istrd_C, jstrd_C; @@ -2168,7 +2175,7 @@ gemm_args_t 
__do_setup(options_t options, matrix_dims_t dims) { gemm_args.B_pl.mat = B_p; gemm_args.C_pl.mat = C_p; } -#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL +#endif // KOKKOSKERNELS_ENABLE_TPL_ARMPL && ARMPL_BUILD >= 1058 gemm_args.alpha = options.blas_args.gemm.alpha; gemm_args.beta = options.blas_args.gemm.beta; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp index 595292ebd7..6497db8de3 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_standalone_perf_test.cpp @@ -158,7 +158,9 @@ int main(int argc, char** argv) { // as number of threads const int device_id = params.use_cuda - 1; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); bool useOMP = params.use_openmp != 0; bool useCUDA = params.use_cuda != 0; diff --git a/perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md b/perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md new file mode 100644 index 0000000000..e558abbff6 --- /dev/null +++ b/perf_test/blas/blas3/papers/kokkos-half-t-rse-escience-2022/reproducer.md @@ -0,0 +1,26 @@ +## To reproduce the half precision results for batched-GEMM: +```bash +git clone https://github.com/kokkos/kokkos.git +git clone https://github.com/kokkos/kokkos-kernels.git +cd kokkos-kernels +git checkout tags/papers/us-rse-escience-2022 +cd perf_test/blas/blas3 +export KOKKOS_SRC_DIR=/path/to/kokkos +export KOKKOSKERNELS_SRC_DIR=/path/to/kokkos-kernels +``` + +### On V100 +```bash +./KokkosBatched_BatchedGemm_benchmark.sh double SNB VOLTA70 +./KokkosBatched_BatchedGemm_benchmark.sh float SNB VOLTA70 +./KokkosBatched_BatchedGemm_benchmark.sh half SNB VOLTA70 +./KokkosBatched_BatchedGemm_benchmark.sh bhalf SNB VOLTA70 +``` + +### On A100 +```bash 
+./KokkosBatched_BatchedGemm_benchmark.sh double DEFAULT AMPERE80 +./KokkosBatched_BatchedGemm_benchmark.sh float DEFAULT AMPERE80 +./KokkosBatched_BatchedGemm_benchmark.sh half DEFAULT AMPERE80 +./KokkosBatched_BatchedGemm_benchmark.sh bhalf DEFAULT AMPERE80 +``` \ No newline at end of file diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index 8b16111157..8a97d77a38 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -55,6 +55,7 @@ #include "KokkosKernels_TestParameters.hpp" #include "KokkosGraph_Distance1Color.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" void print_options(std::ostream &os, const char *app_name, unsigned int indent = 0) { @@ -376,16 +377,14 @@ void run_multi_mem_experiment(Parameters params) { if (params.a_mem_space == 1) { fast_crstmat_t a_fast_crsmat; a_fast_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( - a_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); a_fast_crsgraph = a_fast_crsmat.graph; num_cols = a_fast_crsmat.numCols(); } else { slow_crstmat_t a_slow_crsmat; a_slow_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( - a_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); a_slow_crsgraph = a_slow_crsmat.graph; num_cols = a_slow_crsmat.numCols(); } @@ -537,7 +536,9 @@ int main(int argc, char **argv) { params.use_openmp; // Assumption is that use_openmp variable is provided // as number of threads const int device_id = std::max(params.use_cuda, params.use_hip) - 1; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); Kokkos::print_configuration(std::cout); #if defined(KOKKOS_ENABLE_OPENMP) diff --git a/perf_test/graph/KokkosGraph_color_d2.cpp b/perf_test/graph/KokkosGraph_color_d2.cpp index 7d6f45889a..b824ced38a 100644 --- 
a/perf_test/graph/KokkosGraph_color_d2.cpp +++ b/perf_test/graph/KokkosGraph_color_d2.cpp @@ -65,6 +65,7 @@ #include #include "KokkosKernels_default_types.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" using namespace KokkosGraph; @@ -595,7 +596,7 @@ void experiment_driver(const D2Parameters& params) { using graph_t = typename crsMat_t::StaticCrsGraphType; crsMat_t A = - KokkosKernels::Impl::read_kokkos_crst_matrix(params.mtx_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(params.mtx_file); graph_t Agraph = A.graph; int num_cols = A.numCols(); @@ -631,7 +632,9 @@ int main(int argc, char* argv[]) { device_id = params.use_cuda - 1; else if (params.use_hip) device_id = params.use_hip - 1; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); // Print out verbose information about the configuration of the run. // Kokkos::print_configuration(std::cout); diff --git a/perf_test/graph/KokkosGraph_mis_d2.cpp b/perf_test/graph/KokkosGraph_mis_d2.cpp index c68d5f85e2..df5e28b315 100644 --- a/perf_test/graph/KokkosGraph_mis_d2.cpp +++ b/perf_test/graph/KokkosGraph_mis_d2.cpp @@ -66,6 +66,7 @@ #include "KokkosGraph_MIS2.hpp" #include "KokkosKernels_default_types.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" using namespace KokkosGraph; @@ -253,11 +254,11 @@ void run_mis2(const MIS2Parameters& params) { Kokkos::Timer t; crsMat_t A_in = - KokkosKernels::Impl::read_kokkos_crst_matrix(params.mtx_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(params.mtx_file); std::cout << "I/O time: " << t.seconds() << " s\n"; t.reset(); // Symmetrize the matrix just in case - crsMat_t At_in = KokkosKernels::Impl::transpose_matrix(A_in); + crsMat_t At_in = KokkosSparse::Impl::transpose_matrix(A_in); crsMat_t A; KKH kkh; const default_scalar one = Kokkos::ArithTraits::one(); diff --git 
a/perf_test/graph/KokkosGraph_run_triangle.hpp b/perf_test/graph/KokkosGraph_run_triangle.hpp index 2fee139a64..0a189cd3e1 100644 --- a/perf_test/graph/KokkosGraph_run_triangle.hpp +++ b/perf_test/graph/KokkosGraph_run_triangle.hpp @@ -117,9 +117,7 @@ struct Flush { void init(value_type &update) { update = 0; } KOKKOS_INLINE_FUNCTION - void join(volatile value_type &update, const volatile value_type &input) { - update += input; - } + void join(value_type &update, const value_type &input) { update += input; } KOKKOS_INLINE_FUNCTION void operator()(const int i, value_type &update) const { update += _buf[i]; } diff --git a/perf_test/graph/KokkosGraph_triangle.cpp b/perf_test/graph/KokkosGraph_triangle.cpp index 17e4a08de4..be0b57492a 100644 --- a/perf_test/graph/KokkosGraph_triangle.cpp +++ b/perf_test/graph/KokkosGraph_triangle.cpp @@ -296,12 +296,14 @@ int main(int argc, char **argv) { params.use_openmp; // Assumption is that use_openmp variable is provided // as number of threads const int device_id = 0; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); #if defined(KOKKOS_ENABLE_OPENMP) if (params.use_openmp) { - Kokkos::OpenMP::print_configuration(std::cout); + Kokkos::OpenMP().print_configuration(std::cout); #ifdef KOKKOSKERNELS_MULTI_MEM KokkosKernels::Experiment::run_multi_mem_triangle< size_type, idx, Kokkos::OpenMP, Kokkos::OpenMP::memory_space, @@ -317,7 +319,7 @@ int main(int argc, char **argv) { #if defined(KOKKOS_ENABLE_CUDA) if (params.use_cuda) { - Kokkos::Cuda::print_configuration(std::cout); + Kokkos::Cuda().print_configuration(std::cout); #ifdef KOKKOSKERNELS_MULTI_MEM KokkosKernels::Experiment::run_multi_mem_triangle< size_type, idx, Kokkos::Cuda, Kokkos::Cuda::memory_space, @@ -333,7 +335,7 @@ int main(int argc, char **argv) { #if defined(KOKKOS_ENABLE_HIP) if (params.use_hip) { - 
Kokkos::Experimental::HIP::print_configuration(std::cout); + Kokkos::Experimental::HIP().print_configuration(std::cout); KokkosKernels::Experiment::run_multi_mem_triangle< size_type, idx, Kokkos::Experimental::HIP, Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace>(params); diff --git a/perf_test/sparse/KokkosSparse_block_pcg.cpp b/perf_test/sparse/KokkosSparse_block_pcg.cpp index 89ab0bfdca..3628eac956 100644 --- a/perf_test/sparse/KokkosSparse_block_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_block_pcg.cpp @@ -50,7 +50,7 @@ #include "KokkosSparse_pcg.hpp" #include "KokkosKernels_Utils.hpp" -#include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -75,7 +75,7 @@ crsMat_t create_crs_matrix(char *mtx_bin_file) { if (std::string(mtx_bin_file) == "auto") { INDEX_TYPE num_rows = 11, num_cols = 11, nnz = 40; - crsmat = KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + crsmat = KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(num_rows, num_cols, nnz, 3, 5); printf("generating test matrix automatically\n"); printf(" num rows: %d", num_rows); @@ -86,7 +86,7 @@ crsMat_t create_crs_matrix(char *mtx_bin_file) { INDEX_TYPE *xadj, *adj; SCALAR_TYPE *ew; - KokkosKernels::Impl::read_matrix( + KokkosSparse::Impl::read_matrix( &nv, &ne, &xadj, &adj, &ew, mtx_bin_file); row_map_view_t rowmap_view("rowmap_view", nv + 1); @@ -322,7 +322,7 @@ void run_experiment( // typedef typename lno_nnz_view_t::value_type lno_t; // typedef typename lno_view_t::value_type size_type; // typedef typename scalar_view_t::value_type scalar_t; - KokkosKernels::Impl::kk_create_blockcrs_formated_point_crsmatrix( + KokkosSparse::Impl::kk_create_blockcrs_formatted_point_crsmatrix( block_size, crsmat.numRows(), crsmat.numCols(), crsmat.graph.row_map, crsmat.graph.entries, crsmat.values, out_r, out_c, pf_rm, pf_e, pf_v); @@ -349,7 +349,7 @@ void run_experiment( scalar_view_t bf_v; size_t 
but_r, but_c; - KokkosKernels::Impl::kk_create_blockcrs_from_blockcrs_formatted_point_crs( + KokkosSparse::Impl::kk_create_blockcrs_from_blockcrs_formatted_point_crs( block_size, out_r, out_c, pf_rm, pf_e, pf_v, but_r, but_c, bf_rm, bf_e, bf_v); @@ -381,7 +381,7 @@ int main(int argc, char **argv) { int cmdline[CMD_COUNT]; char *mtx_bin_file = NULL; int block_size = 5; - struct Kokkos::InitArguments kargs; + Kokkos::InitializationSettings kargs; for (int i = 0; i < CMD_COUNT; ++i) cmdline[i] = 0; @@ -389,9 +389,11 @@ int main(int argc, char **argv) { if (0 == Test::string_compare_no_case(argv[i], "--serial")) { cmdline[CMD_USE_SERIAL] = 1; } else if (0 == Test::string_compare_no_case(argv[i], "--threads")) { - kargs.num_threads = cmdline[CMD_USE_THREADS] = atoi(argv[++i]); + cmdline[CMD_USE_THREADS] = atoi(argv[++i]); + kargs.set_num_threads(cmdline[CMD_USE_THREADS]); } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { - kargs.num_threads = cmdline[CMD_USE_OPENMP] = atoi(argv[++i]); + cmdline[CMD_USE_OPENMP] = atoi(argv[++i]); + kargs.set_num_threads(cmdline[CMD_USE_OPENMP]); } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { cmdline[CMD_USE_CUDA] = 1; } else if (0 == Test::string_compare_no_case(argv[i], "--mtx")) { @@ -435,7 +437,7 @@ int main(int argc, char **argv) { if (cmdline[CMD_USE_SERIAL]) { using myExecSpace = Kokkos::Serial; - Kokkos::Serial::print_configuration(std::cout); + myExecSpace().print_configuration(std::cout); using crsMat_t = typename KokkosSparse::CrsMatrix #include #include "KokkosKernels_default_types.hpp" +#include "KokkosSparse_IOUtils.hpp" #include #include #include @@ -177,7 +178,7 @@ crsMat_t generateLongRowMatrix(const GS_Parameters& params) { rowmap.data(), numRows + 1)); crsMat_t A("A", numRows, numRows, totalEntries, valuesView, rowmapView, entriesView); - A = KokkosKernels::sort_and_merge_matrix(A); + A = KokkosSparse::sort_and_merge_matrix(A); if (params.graph_symmetric) { // Symmetrize on host, 
rather than relying on the parallel versions (those // can be tested for symmetric=false) @@ -203,7 +204,7 @@ void runGS(const GS_Parameters& params) { typedef typename crsMat_t::values_type::non_const_type scalar_view_t; crsMat_t A; if (params.matrix_path) - A = KokkosKernels::Impl::read_kokkos_crst_matrix( + A = KokkosSparse::Impl::read_kokkos_crst_matrix( params.matrix_path); else A = generateLongRowMatrix(params); diff --git a/perf_test/sparse/KokkosSparse_kk_spmv.cpp b/perf_test/sparse/KokkosSparse_kk_spmv.cpp index 953294b120..40887d67ec 100644 --- a/perf_test/sparse/KokkosSparse_kk_spmv.cpp +++ b/perf_test/sparse/KokkosSparse_kk_spmv.cpp @@ -55,6 +55,7 @@ #include #include #include +#include #include #include "KokkosKernels_default_types.hpp" @@ -74,11 +75,11 @@ void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop, srand(17312837); matrix_type A; if (filename) - A = KokkosKernels::Impl::read_kokkos_crst_matrix(filename); + A = KokkosSparse::Impl::read_kokkos_crst_matrix(filename); else { Offset nnz = 10 * numRows; // note: the help text says the bandwidth is fixed at 0.01 * numRows - A = KokkosKernels::Impl::kk_generate_sparse_matrix( + A = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, 0, 0.01 * numRows); } numRows = A.numRows(); diff --git a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp index 371f1b1d33..d7ae6da430 100644 --- a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp +++ b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp @@ -44,6 +44,7 @@ #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_run_spgemm.hpp" +#include "KokkosSparse_IOUtils.hpp" namespace KokkosKernels { @@ -74,12 +75,10 @@ void run_multi_mem_spgemm(Parameters params) { if (params.a_mem_space == 1) { a_fast_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( - a_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); } else { a_slow_crsmat = - 
KokkosKernels::Impl::read_kokkos_crst_matrix( - a_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); } if ((b_mat_file == NULL || strcmp(b_mat_file, a_mat_file) == 0) && @@ -90,13 +89,11 @@ void run_multi_mem_spgemm(Parameters params) { } else if (params.b_mem_space == 1) { if (b_mat_file == NULL) b_mat_file = a_mat_file; b_fast_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( - b_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); } else { if (b_mat_file == NULL) b_mat_file = a_mat_file; b_slow_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( - b_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); } if (params.a_mem_space == 1) { @@ -222,18 +219,18 @@ void run_multi_mem_spgemm(Parameters params) { if (c_mat_file != NULL) { if (params.c_mem_space == 1) { - KokkosKernels::sort_crs_matrix(c_fast_crsmat); + KokkosSparse::sort_crs_matrix(c_fast_crsmat); - KokkosKernels::Impl::write_graph_bin( + KokkosSparse::Impl::write_graph_bin( (lno_t)(c_fast_crsmat.numRows()), (size_type)(c_fast_crsmat.graph.entries.extent(0)), c_fast_crsmat.graph.row_map.data(), c_fast_crsmat.graph.entries.data(), c_fast_crsmat.values.data(), c_mat_file); } else { - KokkosKernels::sort_crs_matrix(c_slow_crsmat); + KokkosSparse::sort_crs_matrix(c_slow_crsmat); - KokkosKernels::Impl::write_graph_bin( + KokkosSparse::Impl::write_graph_bin( (lno_t)c_slow_crsmat.numRows(), (size_type)c_slow_crsmat.graph.entries.extent(0), c_slow_crsmat.graph.row_map.data(), diff --git a/perf_test/sparse/KokkosSparse_pcg.cpp b/perf_test/sparse/KokkosSparse_pcg.cpp index 5f34ec1cd9..51c2cbb01b 100644 --- a/perf_test/sparse/KokkosSparse_pcg.cpp +++ b/perf_test/sparse/KokkosSparse_pcg.cpp @@ -49,6 +49,7 @@ #include "KokkosKernels_IOUtils.hpp" #include "KokkosKernels_default_types.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" #include #define MAXVAL 1 @@ -263,9 +264,8 @@ void run_pcg(int *cmdline, const char *mtx_file) 
{ default_lno_t *xadj, *adj; default_scalar *ew; - KokkosKernels::Impl::read_matrix(&nv, &ne, &xadj, &adj, &ew, - mtx_file); + KokkosSparse::Impl::read_matrix( + &nv, &ne, &xadj, &adj, &ew, mtx_file); typedef typename KokkosSparse::CrsMatrix( - a_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); } else { a_slow_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( - a_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); } if ((b_mat_file == NULL || strcmp(b_mat_file, a_mat_file) == 0) && @@ -353,13 +352,11 @@ void run_spgemm_jacobi(Parameters params) { } else if (params.b_mem_space == 1) { if (b_mat_file == NULL) b_mat_file = a_mat_file; b_fast_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( - b_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); } else { if (b_mat_file == NULL) b_mat_file = a_mat_file; b_slow_crsmat = - KokkosKernels::Impl::read_kokkos_crst_matrix( - b_mat_file); + KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); } if (params.a_mem_space == 1) { @@ -485,18 +482,18 @@ void run_spgemm_jacobi(Parameters params) { if (c_mat_file != NULL) { if (params.c_mem_space == 1) { - KokkosKernels::sort_crs_matrix(c_fast_crsmat); + KokkosSparse::sort_crs_matrix(c_fast_crsmat); - KokkosKernels::Impl::write_graph_bin( + KokkosSparse::Impl::write_graph_bin( (lno_t)(c_fast_crsmat.numRows()), (size_type)(c_fast_crsmat.graph.entries.extent(0)), c_fast_crsmat.graph.row_map.data(), c_fast_crsmat.graph.entries.data(), c_fast_crsmat.values.data(), c_mat_file); } else { - KokkosKernels::sort_crs_matrix(c_slow_crsmat); + KokkosSparse::sort_crs_matrix(c_slow_crsmat); - KokkosKernels::Impl::write_graph_bin( + KokkosSparse::Impl::write_graph_bin( (lno_t)c_slow_crsmat.numRows(), (size_type)c_slow_crsmat.graph.entries.extent(0), c_slow_crsmat.graph.row_map.data(), diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 7b0bd42d2a..5448843168 100644 --- 
a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -45,8 +45,9 @@ #include #include "KokkosKernels_config.h" #include "KokkosKernels_Handle.hpp" -#include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils_cusparse.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" +#include "KokkosSparse_Utils_mkl.hpp" #include "KokkosSparse_spadd.hpp" #include "KokkosKernels_TestUtils.hpp" @@ -57,21 +58,6 @@ #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include #include - -inline void spadd_mkl_internal_safe_call(sparse_status_t mklStatus, - const char* name, - const char* file = nullptr, - const int line = 0) { - if (SPARSE_STATUS_SUCCESS != mklStatus) { - std::ostringstream oss; - oss << "MKL call \"" << name << "\" encountered error at " << file << ":" - << line << '\n'; - Kokkos::abort(oss.str().c_str()); - } -} - -#define SPADD_MKL_SAFE_CALL(call) \ - spadd_mkl_internal_safe_call(call, #call, __FILE__, __LINE__) #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) && \ @@ -125,19 +111,19 @@ void run_experiment(const Params& params) { lno_t n = params.n; if (params.amtx.length()) { std::cout << "Loading A from " << params.amtx << '\n'; - A = KokkosKernels::Impl::read_kokkos_crst_matrix( + A = KokkosSparse::Impl::read_kokkos_crst_matrix( params.amtx.c_str()); m = A.numRows(); n = A.numCols(); } else { std::cout << "Randomly generating A\n"; size_type nnzUnused = m * params.nnzPerRow; - A = KokkosKernels::Impl::kk_generate_sparse_matrix( - m, n, nnzUnused, 0, (n + 3) / 3); + A = KokkosSparse::Impl::kk_generate_sparse_matrix(m, n, nnzUnused, + 0, (n + 3) / 3); } if (params.bmtx.length()) { std::cout << "Loading B from " << params.bmtx << '\n'; - B = KokkosKernels::Impl::read_kokkos_crst_matrix( + B = KokkosSparse::Impl::read_kokkos_crst_matrix( params.bmtx.c_str()); } else if (params.bDiag) { std::cout << "Generating B as diagonal matrix.\n"; @@ -168,8 +154,8 @@ void run_experiment(const Params& params) { } 
else { std::cout << "Randomly generating B\n"; size_type nnzUnused = m * params.nnzPerRow; - B = KokkosKernels::Impl::kk_generate_sparse_matrix( - m, n, nnzUnused, 0, (n + 3) / 3); + B = KokkosSparse::Impl::kk_generate_sparse_matrix(m, n, nnzUnused, + 0, (n + 3) / 3); } // Make sure dimensions are compatible if (A.numRows() != B.numRows() || A.numCols() != B.numCols()) { @@ -200,8 +186,8 @@ void run_experiment(const Params& params) { if (params.sorted) { std::cout << "Assuming input matrices are sorted (explicitly sorting just " "in case)\n"; - KokkosKernels::sort_crs_matrix(A); - KokkosKernels::sort_crs_matrix(B); + KokkosSparse::sort_crs_matrix(A); + KokkosSparse::sort_crs_matrix(B); } else std::cout << "Assuming input matrices are not sorted.\n"; kh.create_spadd_handle(params.sorted); @@ -259,11 +245,11 @@ void run_experiment(const Params& params) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL sparse_matrix_t Amkl, Bmkl, Cmkl; if (params.use_mkl) { - SPADD_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( &Amkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)A.graph.row_map.data(), (int*)A.graph.row_map.data() + 1, A.graph.entries.data(), A.values.data())); - SPADD_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( &Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, (int*)B.graph.row_map.data(), (int*)B.graph.row_map.data() + 1, B.graph.entries.data(), B.values.data())); @@ -326,9 +312,9 @@ void run_experiment(const Params& params) { #endif } else if (params.use_mkl) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - SPADD_MKL_SAFE_CALL(mkl_sparse_d_add(SPARSE_OPERATION_NON_TRANSPOSE, - Amkl, 1.0, Bmkl, &Cmkl)); - SPADD_MKL_SAFE_CALL(mkl_sparse_destroy(Cmkl)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_add( + SPARSE_OPERATION_NON_TRANSPOSE, Amkl, 1.0, Bmkl, &Cmkl)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(Cmkl)); #endif } else { spadd_numeric( @@ -351,8 +337,8 @@ void run_experiment(const Params& params) { #ifdef 
KOKKOSKERNELS_ENABLE_TPL_MKL if (params.use_mkl) { - SPADD_MKL_SAFE_CALL(mkl_sparse_destroy(Amkl)); - SPADD_MKL_SAFE_CALL(mkl_sparse_destroy(Bmkl)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(Amkl)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(Bmkl)); } #endif @@ -377,8 +363,8 @@ void run_experiment(const Params& params) { std::cout << "Writing C (" << m << "x" << n << ") to " << params.cmtx << "\n"; crsMat_t C("C", m, n, c_nnz, valuesC, row_mapC, entriesC); - KokkosKernels::Impl::write_kokkos_crst_matrix( - C, params.cmtx.c_str()); + KokkosSparse::Impl::write_kokkos_crst_matrix(C, + params.cmtx.c_str()); } } @@ -490,7 +476,9 @@ int main(int argc, char** argv) { // as number of threads const int device_id = params.use_cuda - 1; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); // Kokkos::print_configuration(std::cout); // First, make sure that requested TPL (if any) is actually available diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp index 9fada4caaa..da705fcdf2 100644 --- a/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -294,7 +294,9 @@ int main(int argc, char** argv) { const int device_id = params.use_cuda ? 
params.use_cuda - 1 : params.use_hip - 1; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); Kokkos::print_configuration(std::cout); #if defined(KOKKOS_ENABLE_OPENMP) diff --git a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp index 98942acb27..aa3969e6c8 100644 --- a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp @@ -259,7 +259,9 @@ int main(int argc, char** argv) { const int num_threads = std::max(params.use_openmp, params.use_threads); const int device_id = params.use_cuda - 1; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); Kokkos::print_configuration(std::cout); #if defined(KOKKOS_ENABLE_OPENMP) diff --git a/perf_test/sparse/KokkosSparse_spiluk.cpp b/perf_test/sparse/KokkosSparse_spiluk.cpp index d381b9b888..b86ecc352f 100644 --- a/perf_test/sparse/KokkosSparse_spiluk.cpp +++ b/perf_test/sparse/KokkosSparse_spiluk.cpp @@ -58,13 +58,14 @@ #include -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosSparse_spiluk.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosBlas1_nrm2.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_default_types.hpp" #include +#include #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) && \ (!defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION)) @@ -111,7 +112,7 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, if (!afilename.empty()) { std::cout << "ILU(K) Begin: Read matrix filename " << afilename << std::endl; - crsmat_t A = KokkosKernels::Impl::read_kokkos_crst_matrix( + crsmat_t A = KokkosSparse::Impl::read_kokkos_crst_matrix( afilename.c_str()); // in_matrix graph_t 
graph = A.graph; // in_graph const size_type nrows = graph.numRows(); @@ -257,6 +258,10 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, << std::endl; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + // cuSPARSE requires lno_t = size_type = int. For both, int is always used + // (if enabled) +#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT) if (fill_lev == 0) { std::cout << "CUSPARSE: No KK interface added yet" << std::endl; @@ -412,6 +417,7 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, } // end row std::cout << "ILU(0) SUCCESS!" << std::endl; } // fill_lev=0 +#endif #endif // Benchmark @@ -436,6 +442,10 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "LOOP_MIN_TIME: " << min_time << std::endl; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + // cuSPARSE requires lno_t = size_type = int. For both, int is always used + // (if enabled) +#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT) if (fill_lev == 0) { lno_view_t A_row_map("A_row_map", nrows + 1); lno_nnz_view_t A_entries("A_entries", nnz); @@ -465,15 +475,21 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int kin, std::cout << "LOOP_MAX_TIME (cuSPARSE): " << max_time << std::endl; std::cout << "LOOP_MIN_TIME (cuSPARSE): " << min_time << std::endl; } // fill_lev=0 +#endif #endif } // end tests #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + // cuSPARSE requires lno_t = size_type = int. 
For both, int is always used + // (if enabled) +#if defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT) // step 6: free resources cudaFree(pBuffer); cusparseDestroyCsrilu02Info(info); cusparseDestroyMatDescr(descr); cusparseDestroy(handle); +#endif #endif } // end if (!afilename.empty()) diff --git a/perf_test/sparse/KokkosSparse_spmv.cpp b/perf_test/sparse/KokkosSparse_spmv.cpp index 6b67905adc..9eec6181a7 100644 --- a/perf_test/sparse/KokkosSparse_spmv.cpp +++ b/perf_test/sparse/KokkosSparse_spmv.cpp @@ -55,6 +55,7 @@ #include #include #include +#include #include #include "KokkosKernels_default_types.hpp" #include @@ -90,12 +91,12 @@ int test_crs_matrix_singlevec(Ordinal numRows, Ordinal numCols, int test, srand(17312837); matrix_type A; if (filename) - A = KokkosKernels::Impl::read_kokkos_crst_matrix(filename); + A = KokkosSparse::Impl::read_kokkos_crst_matrix(filename); else { Offset nnz = 10 * numRows; // note: the help text says the bandwidth is fixed at 0.01 * numRows // CAVEAT: small problem sizes are problematic, b/c of 0.01*numRows - A = KokkosKernels::Impl::kk_generate_sparse_matrix( + A = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, 0, 0.01 * numRows); } SPMVTestData test_data = setup_test(&data, A, rows_per_thread, team_size, diff --git a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp index ca16f2067e..c578c269f8 100644 --- a/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_struct_tuning.cpp @@ -603,9 +603,13 @@ int main(int argc, char** argv) { &vecY, y1.extent_int(0), (void*)y1.data(), myCudaDataType)); const double alpha = 1.0, beta = 1.0; - size_t bufferSize = 0; - void* dBuffer = NULL; + size_t bufferSize = 0; + void* dBuffer = NULL; +#if CUSPARSE_VERSION >= 11201 + cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; +#else cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; +#endif 
KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize( controls.getCusparseHandle(), CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, A_cusparse, vecX, &beta, vecY, myCudaDataType, alg, diff --git a/perf_test/sparse/KokkosSparse_sptrsv.cpp b/perf_test/sparse/KokkosSparse_sptrsv.cpp index c6787242d9..a27ed3f6d2 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv.cpp @@ -58,12 +58,13 @@ #include -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosSparse_sptrsv.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_default_types.hpp" #include +#include "KokkosSparse_IOUtils.hpp" //#define INTERNAL_CUSPARSE @@ -159,7 +160,7 @@ int test_sptrsv_perf(std::vector tests, const std::string &lfilename, if (!lfilename.empty()) { std::cout << "Lower Tri Begin: Read matrix filename " << lfilename << std::endl; - crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix( + crsmat_t triMtx = KokkosSparse::Impl::read_kokkos_crst_matrix( lfilename.c_str()); // in_matrix graph_t graph = triMtx.graph; // in_graph const size_type nrows = graph.numRows(); @@ -567,7 +568,7 @@ int test_sptrsv_perf(std::vector tests, const std::string &lfilename, if (!ufilename.empty()) { std::cout << "Upper Tri Begin: Read matrix filename " << ufilename << std::endl; - crsmat_t triMtx = KokkosKernels::Impl::read_kokkos_crst_matrix( + crsmat_t triMtx = KokkosSparse::Impl::read_kokkos_crst_matrix( ufilename.c_str()); // in_matrix graph_t graph = triMtx.graph; // in_graph const size_type nrows = graph.numRows(); diff --git a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp index 039c88e9c1..b77f0b1d07 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv_supernode.cpp @@ -43,9 +43,10 @@ */ #include "Kokkos_Random.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include 
"KokkosSparse_Utils.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_sptrsv.hpp" #include "KokkosSparse_sptrsv_supernode.hpp" @@ -58,12 +59,8 @@ #include "KokkosSparse_sptrsv_aux.hpp" -using namespace KokkosKernels; -using namespace KokkosKernels::Impl; -using namespace KokkosKernels::Experimental; -using namespace KokkosSparse; -using namespace KokkosSparse::Experimental; -using namespace KokkosSparse::PerfTest::Experimental; +namespace KSExp = KokkosSparse::Experimental; +namespace KSPTE = KokkosSparse::PerfTest::Experimental; enum { CUSPARSE, @@ -130,7 +127,7 @@ int test_sptrsv_perf(std::vector tests, bool verbose, std::cout << " > Read a triangular-matrix filename " << matrix_filename << std::endl; host_crsmat_t M = - KokkosKernels::Impl::read_kokkos_crst_matrix( + KokkosSparse::Impl::read_kokkos_crst_matrix( matrix_filename.c_str()); const size_type nrows = M.graph.numRows(); // transpose the matrix to be stored in CCS @@ -153,10 +150,10 @@ int test_sptrsv_perf(std::vector tests, bool verbose, cols_view_t entries("colmap_view", nnzL); values_view_t values("values_view", nnzL); // transpose L - transpose_matrix(nrows, nrows, row_mapM, entriesM, - valuesM, row_map, entries, values); + KokkosSparse::Impl::transpose_matrix< + in_row_map_view_t, in_cols_view_t, in_values_view_t, row_map_view_t, + cols_view_t, values_view_t, row_map_view_t, host_execution_space>( + nrows, nrows, row_mapM, entriesM, valuesM, row_map, entries, values); // store L in CSC host_graph_t static_graph(entries, row_map); @@ -211,24 +208,24 @@ int test_sptrsv_perf(std::vector tests, bool verbose, if (test == SUPERNODAL_NAIVE) { std::cout << " > create handle for SUPERNODAL_NAIVE" << std::endl << std::endl; - khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_NAIVE, nrows, - true); - khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_NAIVE, nrows, - true); + 
khL.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_NAIVE, + nrows, true); + khU.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_NAIVE, + nrows, true); } else if (test == SUPERNODAL_DAG) { std::cout << " > create handle for SUPERNODAL_DAG" << std::endl << std::endl; - khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, - true); - khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_DAG, nrows, - true); + khL.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_DAG, + nrows, true); + khU.create_sptrsv_handle(KSExp::SPTRSVAlgorithm::SUPERNODAL_DAG, + nrows, true); } else if (test == SUPERNODAL_SPMV_DAG) { std::cout << " > create handle for SUPERNODAL_SPMV_DAG" << std::endl << std::endl; - khL.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, - nrows, true); - khU.create_sptrsv_handle(SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, - nrows, true); + khL.create_sptrsv_handle( + KSExp::SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, nrows, true); + khU.create_sptrsv_handle( + KSExp::SPTRSVAlgorithm::SUPERNODAL_SPMV_DAG, nrows, true); } // verbose (optional, default is false) khL.set_sptrsv_verbose(verbose); @@ -253,13 +250,13 @@ int test_sptrsv_perf(std::vector tests, bool verbose, // graph/dag) khU.get_sptrsv_handle()->set_column_major( !khL.get_sptrsv_handle()->is_column_major()); - sptrsv_supernodal_symbolic(nsuper, supercols.data(), etree, L.graph, - &khL, L.graph, &khU); + KSExp::sptrsv_supernodal_symbolic(nsuper, supercols.data(), etree, + L.graph, &khL, L.graph, &khU); // ============================================== // do numeric compute (copy numerical values from SuperLU data // structure to our sptrsv data structure) - sptrsv_compute(&khL, L); + KSExp::sptrsv_compute(&khL, L); // ============================================== // Preaparing for the first solve @@ -283,7 +280,7 @@ int test_sptrsv_perf(std::vector tests, bool verbose, // ============================================== // do L solve timer.reset(); - sptrsv_solve(&khL, sol, 
rhs); + KSExp::sptrsv_solve(&khL, sol, rhs); Kokkos::fence(); std::cout << " > Lower-TRI: " << std::endl; std::cout << " Solve Time : " << timer.seconds() << std::endl; @@ -295,7 +292,7 @@ int test_sptrsv_perf(std::vector tests, bool verbose, // Error Check ** on host ** Kokkos::fence(); std::cout << std::endl; - if (!check_errors(tol, A, rhs_host, sol_host)) { + if (!KSPTE::check_errors(tol, A, rhs_host, sol_host)) { num_failed++; } @@ -307,7 +304,7 @@ int test_sptrsv_perf(std::vector tests, bool verbose, Kokkos::fence(); for (int i = 0; i < loop; i++) { timer.reset(); - sptrsv_solve(&khL, sol, rhs); + KSExp::sptrsv_solve(&khL, sol, rhs); Kokkos::fence(); double time = timer.seconds(); ave_time += time; diff --git a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp index c32968c177..3a631fc743 100644 --- a/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp +++ b/perf_test/sparse/spmv/Kokkos_SPMV_Inspector.hpp @@ -143,7 +143,7 @@ void kk_inspector_matvec(AType A, XType x, YType y, int team_size, workset_offsets(0) = 0; lno_t ws = 1; for (lno_t row = 0; row < A.numRows(); row++) { - if (A.graph.row_map(row) > ws * nnz_per_workset) { + if (A.graph.row_map(row) > size_type(ws) * nnz_per_workset) { workset_offsets(ws) = row; ws++; } diff --git a/scripts/cm_test_all_sandia b/scripts/cm_test_all_sandia index c049e6b721..db7289619d 100755 --- a/scripts/cm_test_all_sandia +++ b/scripts/cm_test_all_sandia @@ -20,6 +20,7 @@ print_help() { echo "--spack: Run spack builds rather than direct CMake tests" echo "" echo "--debug: Run tests in debug. Defaults to False" + echo "--deprecated-code: Enable deprecated code (disabled by default)" echo "--boundscheck: Enable Kokkos_ENABLE_DEBUG_BOUNDS_CHECK to check View accesses within bounds." echo "--test-script: Test this script, not Kokkos" echo "--skip-hwloc: Do not do hwloc tests" @@ -50,8 +51,8 @@ print_help() { echo "--build-list=BUILD,BUILD,BUILD..." 
echo " Provide a comma-separated list of builds instead of running all builds" echo " Valid items:" - echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial" - echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial" + echo " OpenMP, Threads, Serial, OpenMP_Serial, Threads_Serial" + echo " Cuda_OpenMP, Cuda_Threads, Cuda_Serial" echo "" echo "--with-scalars=SCALARS: set KOKKOSKERNELS_SCALARS" echo " Provide a comma-separated list scalar types" @@ -183,12 +184,12 @@ fi echo "Running on machine: $MACHINE" -GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" +GCC_BUILD_LIST="OpenMP,Threads,Serial,OpenMP_Serial,Threads_Serial" IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" ARM_GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" -INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" -CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" -CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" +INTEL_BUILD_LIST="OpenMP,Threads,Serial,OpenMP_Serial,Threads_Serial" +CLANG_BUILD_LIST="Threads,Serial,Threads_Serial" +CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Threads,Cuda_Serial" CUDA_IBM_BUILD_LIST="Cuda_OpenMP,Cuda_Serial" GCC_WARNING_FLAGS="-Wall,-Wunused-parameter,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" @@ -266,6 +267,9 @@ KOKKOSKERNELS_OFFSETS="int,size_t" KOKKOSKERNELS_LAYOUTS="LayoutLeft" CTESTTIMEOUT=2500 + +KOKKOS_DEPRECATED_CODE="" + # # Handle arguments. 
# @@ -290,6 +294,9 @@ do --boundscheck*) KOKKOS_BOUNDS_CHECK="--boundscheck" ;; + --deprecated-code) + KOKKOS_DEPRECATED_CODE="--deprecated-code" + ;; --build-only*) BUILD_ONLY=True ;; @@ -526,7 +533,7 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" "gcc/7.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/5.0.1 $CLANG_BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "clang/5.0.1 $CLANG_BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" "clang/7.0.1 $CLANG7_MODULE_LIST "Cuda_OpenMP" clang++ $CLANG_WARNING_FLAGS" "cuda/9.2 $CUDA9_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) @@ -535,7 +542,7 @@ elif [ "$MACHINE" = "kokkos-dev" ]; then COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" "gcc/7.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "clang/5.0.1 $CLANG_BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "clang/5.0.1 $CLANG_BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" "clang/7.0.1 $CLANG7_MODULE_LIST "Cuda_OpenMP" clang++ $CLANG_WARNING_FLAGS" "cuda/9.2 $CUDA9_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) @@ -564,9 +571,9 @@ elif [ "$MACHINE" = "inouye" ]; then MODULE_ENVIRONMENT="module purge" eval "$MODULE_ENVIRONMENT" SKIP_HWLOC=True - export OMP_PROC_BIND=close - export OMP_PLACES=cores - export OMP_NUM_THREADS=48 + export omp_proc_bind=close + export omp_places=cores + export omp_num_threads=47 BASE_MODULE_LIST="cmake/3.17.0,/" @@ -620,7 +627,7 @@ elif [ "$MACHINE" = "white" ]; then CUDA10_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.4.0,openblas/0.3.4/gcc/7.4.0" IBM_MODULE_TPL_LIST="cmake/3.19.3,/xl/,gcc/7.2.0,netlib/3.8.0/ibm/xl/16.1.1" - # Don't do pthread on white. 
+ # Don't do Threads on white. GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" # Don't run the IBM toolchain with CXX14 on white @@ -672,13 +679,15 @@ elif [ "$MACHINE" = "weaver" ]; then GCC74_MODULE_TPL_LIST="cmake/3.19.3,/,openblas/0.2.20/gcc/7.2.0,gcc/7.4.0" CUDA_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.2.0,netlib/3.8.0/gcc/7.2.0" CUDA10_MODULE_TPL_LIST="cmake/3.19.3,/,gcc/7.2.0,openblas/0.2.20/gcc/7.2.0" + # Cuda/11 modules available only on the dev queue (rhel8 OS); gcc/8.3.1 load by default + CUDA11_MODULE_LIST="cmake/3.21.2,/,openblas/0.3.18/gcc/8.3.1" # Issues finding CUBLAS with cuda/10.1.243 module at configure # "Could NOT find TPLCUBLAS (missing: CUDA_CUBLAS_LIBRARIES)" # Once resolved add the compiler + modules below to the SPOT_CHECK_TPLS # "cuda/10.1.243 $CUDA10_MODULE_TPL_LIST "Cuda_OpenMP" ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" - # Don't do pthread on weaver + # Don't do Threads on weaver GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" if [ "$SPOT_CHECK" = "True" ]; then @@ -707,6 +716,8 @@ elif [ "$MACHINE" = "weaver" ]; then "cuda/10.1.105 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.1.243 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/10.2.089 $CUDA10_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/10.2.2 $CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/11.2.2 $CUDA11_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) fi @@ -756,6 +767,8 @@ elif [ "$MACHINE" = "caraway" ]; then # output description and success based only on build succes; build time output (no run-time) BASE_MODULE_LIST="cmake/3.19.3,/" + # Cuda11 usage available on the V100 queue + CUDA11_MODULE_LIST="cmake/3.22.2,/,gcc/8.2.0" HIPCLANG_BUILD_LIST="Hip_Serial" HIPCLANG_WARNING_FLAGS="" @@ -763,6 +776,12 @@ elif [ "$MACHINE" = 
"caraway" ]; then # Format: (compiler module-list build-list exe-name warning-flag) COMPILERS=("rocm/4.3.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" "rocm/4.5.0 $BASE_MODULE_LIST $HIPCLANG_BUILD_LIST hipcc $HIPCLANG_WARNING_FLAGS" + "cuda/11.4 $CUDA11_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/8.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/9.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/10.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/11.2.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" ) if [ -z "$ARCH_FLAG" ]; then @@ -789,14 +808,14 @@ elif [ "$MACHINE" = "blake" ]; then #"intel/18.1.163 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" COMPILERS=("intel/19.1.144 $BASE_MODULE_LIST_INTEL "OpenMP_Serial" icpc $INTEL_WARNING_FLAGS" - "gcc/7.2.0 $BASE_MODULE_LIST "Pthread_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" - "clang/10.0.1 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" + "gcc/7.2.0 $BASE_MODULE_LIST "Threads_Serial,OpenMP" g++ $GCC_WARNING_FLAGS" + "clang/10.0.1 $BASE_MODULE_LIST "Threads_Serial" clang++ $CLANG_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) # TODO: Failing toolchains: #"pgi/18.7.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" - COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL "OpenMP,Pthread" icpc $INTEL_WARNING_FLAGS" + COMPILERS=("intel/18.1.163 $BASE_MODULE_LIST_INTEL "OpenMP,Threads" icpc $INTEL_WARNING_FLAGS" "gcc/7.2.0 $GCC72_MODULE_TPL_LIST "OpenMP_Serial" g++ $GCC_WARNING_FLAGS" ) else @@ -845,36 +864,36 @@ elif [ "$MACHINE" = "kokkos-dev-2" ]; then 
CLANG8_MODULE_LIST="sems-archive-env,sems-archive-cmake/3.17.1,sems-archive-gcc/7.2.0,/,cuda/10.0" - BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Pthread" + BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_Threads" BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_OpenMP" - BUILD_LIST_CLANG="Serial,Pthread,OpenMP" + BUILD_LIST_CLANG="Serial,Threads,OpenMP" CLANG8_CUDA_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized,-Wno-pass-failed" if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" + COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Threads" g++ $GCC_WARNING_FLAGS" "gcc/8.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "gcc/9.1 $GCC91_MODULE_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "Serial" icpc $INTEL_WARNING_FLAGS" "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Pthread" icpc $INTEL_WARNING_FLAGS" - "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Pthread_Serial" clang++ $CLANG8_CUDA_WARNING_FLAGS" - "clang/9.0.0 $CLANG_BASE_MODULE_LIST "Serial,Pthread" clang++ $CLANG_WARNING_FLAGS" + "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Threads" icpc $INTEL_WARNING_FLAGS" + "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Threads_Serial" clang++ $CLANG8_CUDA_WARNING_FLAGS" + "clang/9.0.0 $CLANG_BASE_MODULE_LIST "Serial,Threads" clang++ $CLANG_WARNING_FLAGS" "cuda/10.1 $NVCC_SEMSMODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/9.2 $NVCC_SEMSMODULE_LIST "Cuda_Serial" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/11.0 $NVCC11_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) elif [ "$SPOT_CHECK_TPLS" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Pthread" g++ 
$GCC_WARNING_FLAGS" + COMPILERS=("gcc/7.3.0 $BASE_MODULE_LIST "OpenMP,Threads" g++ $GCC_WARNING_FLAGS" "gcc/8.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "gcc/9.1 $GCC91_MODULE_LIST "OpenMP,Serial" g++ $GCC_WARNING_FLAGS" "intel/17.0.1 $OLDINTEL_BASE_MODULE_LIST "Serial" icpc $INTEL_WARNING_FLAGS" "intel/18.0.5 $OLDINTEL_BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" - "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Pthread" icpc $INTEL_WARNING_FLAGS" - "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Pthread_Serial" clang++ $CLANG8_CUDA_WARNING_FLAGS" - "clang/9.0.0 $CLANG_BASE_MODULE_LIST "Serial,Pthread" clang++ $CLANG_WARNING_FLAGS" + "intel/19.0.5 $INTEL_BASE_MODULE_LIST "Threads" icpc $INTEL_WARNING_FLAGS" + "clang/8.0 $CLANG8_MODULE_LIST "Cuda_OpenMP,Threads_Serial" clang++ $CLANG8_CUDA_WARNING_FLAGS" + "clang/9.0.0 $CLANG_BASE_MODULE_LIST "Serial,Threads" clang++ $CLANG_WARNING_FLAGS" "cuda/10.1 $NVCC_SEMSMODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "cuda/11.0 $NVCC11_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) @@ -916,9 +935,9 @@ else exit 1 fi -export OMP_NUM_THREADS=8 -export OMP_PROC_BIND=spread -export OMP_PLACES=cores +export OMP_NUM_THREADS=${omp_num_threads:=8} +export OMP_PROC_BIND=${omp_proc_bind:=spread} +export OMP_PLACES=${omp_places:=cores} declare -i NUM_RESULTS_TO_KEEP=7 @@ -1318,13 +1337,13 @@ single_build_and_test() { # KOKKOS_OPTIONS and KOKKOS_CUDA_OPTIONS are exported and detected by kokkos' generate_makefile.sh during install of kokkos; we pass them to the reproducer script instructions echo " # Use generate_makefile line below to call cmake which generates makefile for this build:" &> call_generate_makefile.sh - echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD 
--kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples $extra_args" &>> call_generate_makefile.sh + echo " ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args" &>> call_generate_makefile.sh chmod +x call_generate_makefile.sh # script command with generic path for faster copy/paste of reproducer into issues - echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} 
${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples $extra_args" &> call_generate_makefile_genericpath.sh + echo " # \$KOKKOSKERNELS_PATH/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=\$KOKKOS_PATH --kokkoskernels-path=\$KOKKOSKERNELS_PATH --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} --with-options=${KOKKOS_OPTIONS} --with-cuda-options=${KOKKOS_CUDA_OPTIONS} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args" &> call_generate_makefile_genericpath.sh - run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + run_cmd ${KOKKOSKERNELS_PATH}/cm_generate_makefile.bash --with-devices=$LOCAL_KOKKOS_DEVICES $ARCH_FLAG --compiler=$(which $compiler_exe) 
--cxxflags=\"$cxxflags\" --cxxstandard=\"$cxx_standard\" --ldflags=\"$ldflags\" $CUDA_ENABLE_CMD $HIP_ENABLE_CMD --kokkos-path=${KOKKOS_PATH} --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=$kk_scalars --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --with-layouts=${KOKKOSKERNELS_LAYOUTS} ${KOKKOSKERNELS_ENABLE_TPL_CMD} ${KOKKOSKERNELS_TPL_PATH_CMD} ${KOKKOSKERNELS_TPL_LIBS_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOS_BOUNDS_CHECK} ${KOKKOSKERNELS_SPACES} --no-examples ${KOKKOS_DEPRECATED_CODE} $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local make_par_lvl=12 if [[ "$MACHINE" = white* ]]; then diff --git a/scripts/docker/Dockerfile.sycl b/scripts/docker/Dockerfile.sycl index 888a36d510..3d94a1a45e 100644 --- a/scripts/docker/Dockerfile.sycl +++ b/scripts/docker/Dockerfile.sycl @@ -1,6 +1,8 @@ ARG BASE=nvidia/cuda:10.2-devel FROM $BASE +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub + RUN apt-get update && apt-get install -y \ bc \ wget \ @@ -36,8 +38,8 @@ RUN CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSIO ENV PATH=${CMAKE_DIR}/bin:$PATH ENV SYCL_DIR=/opt/sycl -RUN SYCL_VERSION=2021-09 && \ - SYCL_URL=https://github.com/intel/llvm/archive && \ +RUN SYCL_VERSION=20220112 && \ + SYCL_URL=https://github.com/intel/llvm/archive/sycl-nightly && \ SYCL_ARCHIVE=${SYCL_VERSION}.tar.gz && \ SCRATCH_DIR=/scratch && mkdir -p ${SCRATCH_DIR} && cd ${SCRATCH_DIR} && \ wget --quiet ${SYCL_URL}/${SYCL_ARCHIVE} && \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a3460d1413..a1c938aed5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -52,7 +52,7 @@ IF (KOKKOSKERNELS_ENABLE_TPL_BLAS OR KOKKOSKERNELS_ENABLE_TPL_MKL) APPEND_GLOB(SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/impl/tpls/KokkosBlas_Host_tpl.cpp) ENDIF() -include(kokkoskernels_eti.cmake) 
+include(cmake/kokkoskernels_eti.cmake) SET(ETI_HEADERS) #Build up a list of DECL, AVAIL, and INST macros @@ -360,6 +360,13 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_numeric spgemm_numeric TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) +KOKKOSKERNELS_GENERATE_ETI(Sparse_bspgemm_numeric bspgemm_numeric + COMPONENTS sparse + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE +) + KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_jacobi spgemm_jacobi COMPONENTS sparse HEADER_LIST ETI_HEADERS @@ -367,6 +374,22 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_spgemm_jacobi spgemm_jacobi TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) +# NOTE: SpAdd symbolic doesn't use scalars directly, +# but it needs the type to use handles. +KOKKOSKERNELS_GENERATE_ETI(Sparse_spadd_symbolic spadd_symbolic + COMPONENTS sparse + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES +) + +KOKKOSKERNELS_GENERATE_ETI(Sparse_spadd_numeric spadd_numeric + COMPONENTS sparse + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES +) + KOKKOSKERNELS_GENERATE_ETI(Sparse_spiluk_symbolic spiluk_symbolic COMPONENTS sparse HEADER_LIST ETI_HEADERS @@ -416,6 +439,13 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_gauss_seidel_apply gauss_seidel_apply TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE ) +KOKKOSKERNELS_GENERATE_ETI(Graph_color_d1 color_d1 + COMPONENTS graph + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES_W_SLOW_SPACE +) + LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h) #Add a few other utility files diff --git a/src/KokkosKernels_Macros.hpp b/src/KokkosKernels_Macros.hpp index 1630028c54..67d86b6e0e 100644 --- a/src/KokkosKernels_Macros.hpp +++ b/src/KokkosKernels_Macros.hpp @@ -66,9 +66,10 @@ // 
https://clang.llvm.org/docs/OpenMPSupport.html#id1 #if defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) // GCC 4.8.5 and older do not support #pragma omp simd -// Do not enable when using GCC 7.2.0 + C++17 due to a bug in gcc -#if (KOKKOS_COMPILER_GNU > 485) && \ - !(KOKKOS_COMPILER_GNU == 720 && defined(KOKKOS_ENABLE_CXX17)) +// Do not enable when using GCC 7.2.0 or 7.3.0 + C++17 due to a bug in gcc +#if (KOKKOS_COMPILER_GNU > 485) && \ + !(KOKKOS_COMPILER_GNU == 720 && defined(KOKKOS_ENABLE_CXX17)) && \ + !(KOKKOS_COMPILER_GNU == 730 && defined(KOKKOS_ENABLE_CXX17)) #define KOKKOSKERNELS_ENABLE_OMP_SIMD #endif // TODO: Check for a clang version that supports #pragma omp simd diff --git a/src/Kokkos_ArithTraits.hpp b/src/Kokkos_ArithTraits.hpp deleted file mode 100644 index 68bcdf79ea..0000000000 --- a/src/Kokkos_ArithTraits.hpp +++ /dev/null @@ -1,3979 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_ARITHTRAITS_HPP -#define KOKKOS_ARITHTRAITS_HPP - -/// \file Kokkos_ArithTraits.hpp -/// \brief Declaration and definition of Kokkos::Details::ArithTraits - -#include -#include -#include -#include - -#ifdef HAVE_KOKKOSKERNELS_QUADMATH -#include -#endif // HAVE_KOKKOSKERNELS_QUADMATH - -#include -#include -#include -#include // std::complex -#include // std::numeric_limits -#ifdef __CUDACC__ -#include -#endif - -namespace { // anonymous - -/// \fn intPowImpl -/// \tparam IntType A built-in integer type. -/// \brief Implementation of intPowSigned and intPowUnsigned. -/// -/// \pre x != 0 -/// \pre y > 0 -/// -/// Use intPowSigned or intPowUnsigned for general y. -template -KOKKOS_FORCEINLINE_FUNCTION IntType intPowImpl(const IntType x, - const IntType y) { - // Recursion (unrolled into while loop): pow(x, 2y) = (x^y)^2 - IntType prod = x; - IntType y_cur = 1; - // If y == 1, then prod stays x. 
- while (y_cur < y) { - prod = prod * prod; - y_cur = y_cur << 1; - } - // abs(y - y_cur) < floor(log2(y)), so it won't hurt asymptotic run - // time to finish the remainder in a linear iteration. - if (y > y_cur) { - const IntType left = y - y_cur; - for (IntType k = 0; k < left; ++k) { - prod = prod * x; - } - } else if (y < y_cur) { - // There's probably a better way to do this in order to avoid the - // (expensive) integer division, but I'm not motivated to think of - // it at the moment. - const IntType left = y_cur - y; - for (IntType k = 0; k < left; ++k) { - prod = prod / x; - } - } - return prod; - - // y = 8: - // - // x,1 -> x^2,2 - // x^2,2 -> x^4,4 - // x^4,4 -> x^8,8 - // - // y = 9: - // - // x,1 -> x^2,2 - // x^2,2 -> x^4,4 - // x^4,4 -> x^8,8 - // - // y - y_cur is what's left over. Just do it one at a time. - // - // y = 3: - // x,1 -> x^2,2 - // x^2,2 -> x^4,4 -} - -// Warning free abs function for types where we don't know whether they are -// signed (like char) -template ::is_signed> -struct integer_abs { - static KOKKOS_INLINE_FUNCTION T abs(const T& val); -}; - -template -struct integer_abs { - static KOKKOS_INLINE_FUNCTION T abs(const T& x) { return x < 0 ? -x : x; } -}; - -template -struct integer_abs { - static KOKKOS_INLINE_FUNCTION T abs(const T& x) { return x; } -}; - -/// \fn intPowSigned -/// \tparam IntType A built-in signed integer type. -/// \brief Compute x raised to the power y. -/// -/// If the arguments are invalid (e.g., if x and y are both zero), the -/// result of this function is undefined. However, this function will -/// not throw an exception in that case. -template -KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if::is_signed, - IntType>::type - intPowSigned(const IntType x, const IntType y) { - // It's not entirely clear what to return if x and y are both zero. - // In the case of floating-point numbers, 0^0 is NaN. Here, though, - // I think it's safe to return 0. 
- if (x == 0) { - return 0; - } else if (y == 0) { - return 1; - } else if (y < 0) { - if (x == 1) { - return 1; - } else if (x == -1) { - return (y % 2 == 0) ? 1 : -1; - } else { - return 0; // round the fraction to zero - } - } - return intPowImpl(x, y); -} -template -KOKKOS_FORCEINLINE_FUNCTION - typename std::enable_if::is_signed, - IntType>::type - intPowSigned(const IntType x, const IntType y) { - // It's not entirely clear what to return if x and y are both zero. - // In the case of floating-point numbers, 0^0 is NaN. Here, though, - // I think it's safe to return 0. - if (x == 0) { - return 0; - } else if (y == 0) { - return 1; - } - return intPowImpl(x, y); -} - -/// \fn intPowUnsigned -/// \tparam IntType A built-in unsigned integer type. -/// \brief Compute x raised to the power y. -/// -/// If the arguments are invalid (e.g., if x and y are both zero), the -/// result of this function is undefined. However, this function will -/// not throw an exception in that case. -template -KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, - const IntType y) { - // It's not entirely clear what to return if x and y are both zero. - // In the case of floating-point numbers, 0^0 is NaN. Here, though, - // I think it's safe to return 0. - if (x == 0) { - return 0; - } else if (y == 0) { - return 1; - } else { - return intPowImpl(x, y); - } -} - -// It might make sense to use special sqrt() approximations for -// integer arguments, like those presented on the following web site: -// -// http://www.azillionmonkeys.com/qed/sqroot.html#implementations -// -// Note that some of the implementations on the above page break ANSI -// C(++) aliasing rules (by assigning to the results of -// reinterpret_cast-ing between int and float). It's also just a -// performance optimization and not required for a reasonable -// implementation. 
- -} // namespace - -namespace Kokkos { -namespace Details { - -/// \class ArithTraits -/// \brief Traits class for arithmetic on type T. -/// \tparam T "Scalar" type of interest -/// -/// This is a traits class for the "arithmetic" type T. "Arithmetic -/// types" include built-in signed and unsigned integer types, -/// floating-point types, complex-valued types, and anything else that -/// looks like these. This class is useful for implementing numerical -/// algorithms that are generic on the data type. You may also use -/// this class to query attributes of T, like whether it is signed or -/// complex, or its precision. -/// -/// We really did not want to implement this class or expose it to -/// users. It would be much better to use existing traits classes -/// like std::numeric_limits. We decided to implement and expose this -/// class for the following reasons: -///
<ol>
-/// <li> std::numeric_limits class methods cannot be used in CUDA -/// device functions, since they themselves are not device -/// functions </li>
-/// <li> Existing traits classes like std::numeric_limits do not -/// provide enough information to implement algorithms that are -/// agnostic of whether T is real-valued or complex-valued. </li>
-/// </ol>
-/// -/// All class methods must be suitable for parallel kernels, if the -/// type T itself is suitable for parallel kernels. In particular, -/// specializations for types T that make sense to use on a CUDA -/// device must mark all class methods as device (and host) functions, -/// using the KOKKOS_FORCEINLINE_FUNCTION macro. All class methods must be -/// callable both inside and outside a parallel kernel (for CUDA, this -/// means they must be marked as both device and host functions). -/// -/// \section Kokkos_ArithTraits_compat Compatibility -/// -/// Whenever possible, class methods in ArithTraits use the same names -/// as their equivalents in the C++ Standard Library. If this was not -/// possible, for example with isInf and isNan, we explain why in -/// their documentation. -/// -/// This class has redundant typedefs and methods in order to maintain -/// backwards compatibility with Teuchos::ScalarTraits, while -/// preferring forwards (partial) compatibility with -/// std::numeric_limits. Users should prefer typedefs, \c bool -/// constants, and class methods compatible with std::numeric_limits, -/// to those from Teuchos::ScalarTraits. The latter may go away at -/// any time. Furthermore, Teuchos::ScalarTraits contains methods -/// that do not make sense for use as parallel device functions, in -/// particular those relating to pseudorandom number generation that -/// refer to hidden state, so we will never include all class methods -/// from Teuchos::ScalarTraits in ArithTraits. -/// -/// \section Kokkos_ArithTraits_unsupp Unsupported types on CUDA devices -/// -/// CUDA does not support long double or std::complex in device -/// functions. ArithTraits does have specializations for these types, -/// but the class methods therein are not marked as device functions. -/// -/// \section Kokkos_ArithTraits_whyNotC99 What about C99 integer types? 
-/// -/// C99 and C++11 include typedefs int${N}_t and uint${N}_t, where N -/// is the number of bits in the integer. These typedefs are useful -/// because they make the length of the type explicit. Users are -/// welcome to use these types as the template parameter of -/// ArithTraits. -/// -/// We chose not to use these types when defining full -/// specializations of ArithTraits. This is because the C99 integer -/// types are typedefs, not types in themselves. This makes it -/// impossible to avoid duplicate or missing full specializations of -/// ArithTraits. For example, on my Mac, for CUDA 5.5, gcc 4.2.1, and -/// Clang 3.2, int64_t is a typedef of long long, -/// but long long and long are separate types, even -/// though they have the same length (64 bits). In contrast, on -/// Windows (even Win64), long is a 32-bit type (but a -/// distinct type from int), and long long is a -/// 64-bit type. Thus, if we define full specializations of -/// ArithTraits using only the C99 integer types, we will be -/// missing a specialization for long on at least one -/// platform. -/// -/// Rather than trouble ourselves with trying to figure this out for -/// each platform, we decided to provide specializations only for the -/// integer types in the C89 and C++03 language standards. This -/// includes signed and unsigned versions of char, -/// short, int, and long. We also include -/// long long if your platform supports it. We may thus have -/// left out some C99 integer type, but this is only possible if the -/// C89 / C++03 integer types do not have complete coverage of all -/// powers of two bits from 8 up to the longest provided length (e.g., -/// 64 on a 64-bit system). On all platforms I have encountered, -/// char has 8 bits and short has 16 bits, so I am -/// not worried about missing specializations for int16_t or -/// uint16_t. If you should find that either of these -/// specializations are missing, though, please let us know. 
-/// -/// Note that char, signed char, and unsigned -/// char are distinct types, whether char is signed or -/// unsigned. (The language standards do not specify whether -/// char is signed or unsigned.) That is, char is -/// not a typedef of signed char or unsigned -/// char. This is why we provide full specializations of -/// ArithTraits for each of these types. Interestingly enough, on my -/// system, char and int8_t are different types, but -/// signed char and int8_t are the same. -/// -/// \section Kokkos_ArithTraits_impl Implementation notes -/// -/// This section contains notes to developers who which to add a -/// partial specialization of this class for a new type T. If you -/// decide to write a default templated implementation, it must not -/// declare any methods as device functions. This ensures correct -/// behavior for arbitrary T, but does require specializations for -/// common types like T = float and double, as well as for other types -/// T that make sense to use on a CUDA device. -template -class ArithTraits { - public: - /// \brief A type that acts like T and works with Kokkos. - /// - /// This is usually just an alias for T. However, some types T do - /// not work well with Kokkos. In that case, we use a mostly - /// equivalent type here. For example, ArithTraits - /// >::val_type is Kokkos::complex. - typedef T val_type; - /// \brief The type of the magnitude (absolute value) of T. - /// - /// We define this as the type returned by abs() in this class. If - /// T is real (not complex), then \c val_type and \c mag_type are - /// usually the same. If T is std::complex for some R, - /// then R and \c mag_type are usually the same. - typedef T mag_type; - - //! Whether ArithTraits has a specialization for T. - static const bool is_specialized = false; - //! Whether T is a signed type (has negative values). - static const bool is_signed = false; - //! Whether T is an integer type. 
- static const bool is_integer = false; - /// \brief Whether T "uses exact representations." - /// - /// The opposite of is_exact is "is approximate," that is, "may - /// commit rounding error." - static const bool is_exact = false; - //! Whether T is a complex-valued type. - static const bool is_complex = false; - - /// \brief Whether x is Inf. - /// - /// This can only be true for floating-point types T that support - /// Inf. If T is a complex type, we say that a T instance x is Inf - /// if and only if isinf(real(x)) || isinf(imag(x)). - /// - /// Unfortunately we can't call this "isinf" (the equivalent C99 - /// function), because CUDA appears to implement that function using - /// a macro, rather than using a function (as C++11 requires). - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const T& x); - - /// \brief Whether x is NaN (not a number). - /// - /// This can only be true for floating-point types T that support - /// NaN. If T is a complex type, we say that a T instance x is NaN - /// if and only if isNan(real(x)) || isNan(imag(x)). - /// - /// Unfortunately we can't call this "isnan" (the equivalent C99 - /// function), because CUDA appears to implement that function using - /// a macro, rather than using a function (as C++11 requires). - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const T& x); - - //! The absolute value (magnitude) of x. - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const T& x); - - //! The zero value of T; the arithmetic identity. - static KOKKOS_FORCEINLINE_FUNCTION T zero(); - - //! The one value of T; the multiplicative identity. - static KOKKOS_FORCEINLINE_FUNCTION T one(); - - /// \brief True if this type T is capable of representing the - /// positive infinity as a distinct special value, as with - /// std::numeric_limits::has_infinity. - static constexpr bool has_infinity = false; - - /// \brief Returns the special value "positive infinity", as - /// represented by the floating-point type T. 
Only meaningful if - /// KokkosArithTraits::has_infinity == true. Provides same - /// functionality as std::numeric_limits::infinity(). - /// - /// \note Would have liked to mark it as constexpr but then would - /// not be able to provide the specialization for std::complex - /// since its constructor only becomes constexpr with C++14. - static KOKKOS_FORCEINLINE_FUNCTION T infinity(); - - /// \brief The minimum possible value of T. - /// - /// If T is a real floating-point type, then this is the minimum - /// positive value, as with std::numeric_limits::min(). - static KOKKOS_FORCEINLINE_FUNCTION T min(); - - //! The maximum possible value of T. - static KOKKOS_FORCEINLINE_FUNCTION T max(); - - /// \brief The real part of x. - /// - /// If \c is_complex is false, then this just returns x. - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const T& x); - - /// \brief The imaginary part of x. - /// - /// If \c is_complex is false, then this just returns zero(). - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const T&); - - /// \brief The complex conjugate of x. - /// - /// If \c is_complex is false, then this just returns x. - static KOKKOS_FORCEINLINE_FUNCTION T conj(const T&); - - //! x raised to the power y. - static KOKKOS_FORCEINLINE_FUNCTION T pow(const T& x, const T& y); - - /// \brief The square root of x. - /// - /// If T is an integer type, this is the floor of the square root. - /// If T is a complex-valued type, then this method returns the - /// principal branch of the square root. - /// - /// If T is real-valued and x is negative, the result of the square - /// root is undefined in general. (CUDA does not allow throwing - /// exceptions in device functions.) Implementations should return - /// NaN if the type T supports this. Of course, in that case, the - /// square of the result will not equal x. - static KOKKOS_FORCEINLINE_FUNCTION T sqrt(const T& x); - - /// \brief The cubic root of x. 
- /// - /// If T is an integer type, this is the floor of the cubic root. - /// If T is a complex-valued type, then this method returns the - /// principal branch of the cubic root. - /// - /// If T is real-valued and x is negative, the result of the cubic - /// root is undefined in general. (CUDA does not allow throwing - /// exceptions in device functions.) Implementations should return - /// NaN if the type T supports this. Of course, in that case, the - /// cubic of the result will not equal x. - static KOKKOS_FORCEINLINE_FUNCTION T cbrt(const T& x); - - /// \brief The natural (base e) exponential function of x. - /// - /// If T is an integer type, this is the floor of the exponential - /// function. If T is a complex-valued type, then this method - /// returns \f$e^{x+iy} = e^x ( cos(y) + i sin(y) )\f$. - /// - static KOKKOS_FORCEINLINE_FUNCTION T exp(const T& x); - - /// \brief The natural (base e) logarithm of x. - /// - /// If T is an integer type, this is the floor of the logarithm. If - /// T is a complex-valued type, then this method returns the - /// principal branch of the logarithm. - /// - /// If T is real-valued and x is negative, the result of the - /// logarithm is undefined in general. (CUDA does not allow - /// throwing exceptions in device functions.) Implementations - /// should return NaN if the type T supports this. Of course, in - /// that case, if y is the result, \f$e^y\f$ will not equal x. - static KOKKOS_FORCEINLINE_FUNCTION T log(const T& x); - - /// \brief The base ten logarithm of the input. - /// - /// If T is an integer type, this is the floor of the logarithm. If - /// T is a complex-valued type, then this method returns the - /// principal branch of the logarithm. - /// - /// If T is real-valued and x is negative, the result of the - /// logarithm is undefined in general. (CUDA does not allow - /// throwing exceptions in device functions.) Implementations - /// should return NaN if the type T supports this. 
Of course, in - /// that case, if y is the result, \f$10^y\f$ will not equal x. - static KOKKOS_FORCEINLINE_FUNCTION T log10(const T& x); - - /// Trigonometric and hyperbolic functions are not available - /// for integer types. This is because asin(sin(x)) is not x - /// when x is integer with a rounding error. - /// - /// KJ: log, exp also has this problem. We probably need to - /// disable them for integer types instead of providing - /// functionality with floor. - - /// \brief The sin function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T sin(const T& x); - - /// \brief The cos function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T cos(const T& x); - - /// \brief The tan function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T tan(const T& x); - - /// \brief The sin hyperbolic function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T sinh(const T& x); - - /// \brief The cos hyperbolic function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T cosh(const T& x); - - /// \brief The tan hyperbolic function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T tanh(const T& x); - - /// \brief The asin function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T asin(const T& x); - - /// \brief The acos function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T acos(const T& x); - - /// \brief The atan function of x - /// - static KOKKOS_FORCEINLINE_FUNCTION T atan(const T& x); - - /// \brief Return a silent NaN, if appropriate for T. - /// - /// If T does not implement a silent NaN, the return value is - /// undefined, but calling this method is still allowed. - static KOKKOS_FORCEINLINE_FUNCTION T nan(); - - /// \brief Machine epsilon. - /// - /// If T is an integer type (std::numeric_traits::is_exact is - /// true), then epsilon() returns 0. Otherwise, if T is a - /// floating-point type, it returns machine epsilon that T. 
- static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon(); - - //@{ - /// \name Traits defined for backwards compatibility with - /// Teuchos::ScalarTraits - /// - /// All of the typedefs, \c bool constants, and class methods in - /// this section are defined in order that one may replace most uses - /// of Teuchos::ScalarTraits with ArithTraits. Users who do not - /// have this backwards compatibility requirement should prefer - /// equivalents in other sections. Those class methods which have - /// the same name and meaning in both Teuchos::ScalarTraits and this - /// class, such as log() and pow(), are not in this section. - - //! Same as mag_type; the type of the absolute value (magnitude) of T. - typedef T magnitudeType; - - /// \brief The type with "half the precision" of T. - /// - /// This typedef only makes sense if T is a floating-point type. - typedef T halfPrecision; - - /// \brief The type with "twice the the precision" of T. - /// - /// This typedef only makes sense if T is a floating-point type. - typedef T doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = false; - - /// \brief True if this type T has floating-point parameters. - /// - /// This is true if and only if this specialization of ArithTraits - /// has "machine-specific" parameters eps(), sfmin(), base(), - /// prec(), t(), rnd(), emin(), rmin(), emax(), and rmax(), relating - /// to floating-point types. - static const bool hasMachineParameters = false; - - //! Return relative machine precision. - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps(); - - //! Return safe minimum (sfmin), such that 1/sfmin does not overflow. - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin(); - - //! Return the base of the scalar type T. - static KOKKOS_FORCEINLINE_FUNCTION int base(); - - //! Return eps*base. - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec(); - - //! Returns the number of (base) digits in the significand. 
- static KOKKOS_FORCEINLINE_FUNCTION int t(); - - //! 1.0 when rounding occurs in addition, else 0.0. - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd(); - - //! Returns the minimum exponent before (gradual) underflow. - static KOKKOS_FORCEINLINE_FUNCTION int emin(); - - //! Returns the underflow threshold: base^(emin-1) - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin(); - - //! Returns the largest exponent before overflow. - static KOKKOS_FORCEINLINE_FUNCTION int emax(); - - //! Overflow theshold: (base^emax)*(1-eps) - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax(); - - //! Same as abs(); return the magnitude of x. - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const T& x); - - //! Same as conj(); return the complex conjugate of x. - static KOKKOS_FORCEINLINE_FUNCTION T conjugate(const T& x); - - /// \brief Whether x is (silent) NaN or Inf. - /// - /// This is the same as isNan(x) || isInf(x). - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const T& x); - - /// \brief The string name of T. - /// - /// Note that this is not a device function. - static std::string name(); - - //! Same as sqrt(x); the square root of x. 
- static KOKKOS_FORCEINLINE_FUNCTION T squareroot(const T& x); - //@} -}; - -// Since Kokkos::Experimental::half_t falls back to float, only define -// ArithTraits if half_t is a backend specialization -#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT -template <> -class ArithTraits { - public: - typedef Kokkos::Experimental::half_t val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return Kokkos::Experimental::cast_to_half(HUGE_VALF); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { -#ifndef __CUDA_ARCH__ - using std::isinf; -#endif - return isinf(Kokkos::Experimental::cast_from_half(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { -#ifndef __CUDA_ARCH__ - using std::isnan; -#endif - return isnan(Kokkos::Experimental::cast_from_half(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return Kokkos::Experimental::cast_to_half( - fabs(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return Kokkos::Experimental::cast_to_half(0.0F); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return Kokkos::Experimental::cast_to_half(1.0F); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::cast_to_half(-KOKKOSKERNELS_IMPL_FP16_MAX); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return Kokkos::Experimental::cast_to_half(0.0F); - } - static 
KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::Experimental::cast_to_half( - ::pow(Kokkos::Experimental::cast_from_half(x), - Kokkos::Experimental::cast_from_half(y))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::sqrt(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(Kokkos::Experimental::cast_from_half(x)) -#else - ::cbrt(Kokkos::Experimental::cast_from_half(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::exp(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::log(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::log10(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::sin(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::cos(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { - return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::tan(Kokkos::Experimental::cast_from_half(x)) -#else - ::tan(Kokkos::Experimental::cast_from_half(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { - 
return Kokkos::Experimental::cast_to_half( - ::sinh(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::cosh(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { - return Kokkos::Experimental::cast_to_half( - ::tanh(Kokkos::Experimental::cast_from_half(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { - return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::asin(Kokkos::Experimental::cast_from_half(x)) -#else - ::asin(Kokkos::Experimental::cast_from_half(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { - return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::acos(Kokkos::Experimental::cast_from_half(x)) -#else - ::acos(Kokkos::Experimental::cast_from_half(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { - return Kokkos::Experimental::cast_to_half( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::atan(Kokkos::Experimental::cast_from_half(x)) -#else - ::atan(Kokkos::Experimental::cast_from_half(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { - // return ::pow(2, -KOKKOSKERNELS_IMPL_FP16_SIGNIFICAND_BITS); - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_EPSILON); - } - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - // C++ doesn't have a standard "half-float" type. 
- typedef val_type halfPrecision; - typedef double doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type x) { - return isNan(x) || isInf(x); - } - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static std::string name() { return "half"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { -#ifdef __CUDA_ARCH__ - return Kokkos::Experimental::cast_to_half(CUDART_NAN_F); -#else - return Kokkos::Experimental::cast_to_half( - std::numeric_limits::quiet_NaN()); -#endif // __CUDA_ARCH__ - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { - return KOKKOSKERNELS_IMPL_FP16_RADIX; - } - // Use float to allow running on both host and device - static KOKKOS_FORCEINLINE_FUNCTION float prec() { - float e = KOKKOSKERNELS_IMPL_FP16_EPSILON; - float b = (float)base(); - float r = e * b; - return r; - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { - return KOKKOSKERNELS_IMPL_FP16_MANT_DIG; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { - return Kokkos::Experimental::cast_to_half(1.0); - } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { - return KOKKOSKERNELS_IMPL_FP16_MIN_EXP; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { - return KOKKOSKERNELS_IMPL_FP16_MAX_EXP; - } - static 
KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); - } -}; -#endif // KOKKOS_HALF_T_IS_FLOAT && KOKKOS_ENABLE_CUDA_HALF - -// Since Kokkos::Experimental::bhalf_t falls back to float, only define -// ArithTraits if bhalf_t is a backend specialization -#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT -template <> -class ArithTraits { - public: - typedef Kokkos::Experimental::bhalf_t val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return Kokkos::Experimental::cast_to_bhalf(HUGE_VALF); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { -#ifndef __CUDA_ARCH__ - using std::isinf; -#endif - return isinf(Kokkos::Experimental::cast_from_bhalf(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { -#ifndef __CUDA_ARCH__ - using std::isnan; -#endif - return isnan(Kokkos::Experimental::cast_from_bhalf(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - fabs(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return Kokkos::Experimental::cast_to_bhalf(0.0F); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return Kokkos::Experimental::cast_to_bhalf(1.0F); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return Kokkos::Experimental::cast_to_bhalf(-KOKKOSKERNELS_IMPL_BF16_MAX); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static 
KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return Kokkos::Experimental::cast_to_bhalf(0.0F); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return Kokkos::Experimental::cast_to_bhalf( - ::pow(Kokkos::Experimental::cast_from_bhalf(x), - Kokkos::Experimental::cast_from_bhalf(y))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::sqrt(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::cbrt(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::exp(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::log(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::log10(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::sin(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::cos(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::tan(Kokkos::Experimental::cast_from_bhalf(x)) 
-#else - ::tan(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::sinh(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::cosh(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( - ::tanh(Kokkos::Experimental::cast_from_bhalf(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::asin(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::asin(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::acos(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::acos(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { - return Kokkos::Experimental::cast_to_bhalf( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::atan(Kokkos::Experimental::cast_from_bhalf(x)) -#else - ::atan(Kokkos::Experimental::cast_from_bhalf(x)) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { - // return ::pow(2, -KOKKOSKERNELS_IMPL_BF16_SIGNIFICAND_BITS); - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_EPSILON); - } - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - // C++ doesn't have a standard "bhalf-float" type. 
- typedef val_type bhalfPrecision; - typedef double doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type x) { - return isNan(x) || isInf(x); - } - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static std::string name() { return "bhalf"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { -#ifdef __CUDA_ARCH__ - return Kokkos::Experimental::cast_to_bhalf(CUDART_NAN_F); -#else - return Kokkos::Experimental::cast_to_bhalf( - std::numeric_limits::quiet_NaN()); -#endif // __CUDA_ARCH__ - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN); - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { - return KOKKOSKERNELS_IMPL_BF16_RADIX; - } - // Use float to allow running on both host and device - static KOKKOS_FORCEINLINE_FUNCTION float prec() { - float e = KOKKOSKERNELS_IMPL_BF16_EPSILON; - float b = (float)base(); - float r = e * b; - return r; - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { - return KOKKOSKERNELS_IMPL_BF16_MANT_DIG; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { - return Kokkos::Experimental::cast_to_bhalf(1.0); - } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { - return KOKKOSKERNELS_IMPL_BF16_MIN_EXP; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN); - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { - return KOKKOSKERNELS_IMPL_BF16_MAX_EXP; - } - 
static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); - } -}; -#endif // KOKKOS_BHALF_T_IS_FLOAT - -template <> -class ArithTraits { - public: - typedef float val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION float infinity() { return HUGE_VALF; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isinf; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isinf; -#endif - return isinf(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isnan; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isnan; -#endif - return isnan(x); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const float x) { - return ::fabs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float zero() { return 0.0; } - static KOKKOS_FORCEINLINE_FUNCTION float one() { return 1.0; } - static KOKKOS_FORCEINLINE_FUNCTION float min() { return -FLT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION float max() { return FLT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const float x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const float) { return 0.0; } - static KOKKOS_FORCEINLINE_FUNCTION float conj(const float x) { return x; } - static KOKKOS_FORCEINLINE_FUNCTION float pow(const float x, const float y) { - return ::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION float sqrt(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif - } - static 
KOKKOS_FORCEINLINE_FUNCTION float cbrt(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::cbrt(x); -#else - return ::cbrt(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION float exp(const float x) { - return ::exp(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float log(const float x) { - return ::log(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float log10(const float x) { - return ::log10(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float sin(const float x) { - return ::sin(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float cos(const float x) { - return ::cos(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float tan(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::tan(x); -#else - return std::tan(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION float sinh(const float x) { - return ::sinh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float cosh(const float x) { - return ::cosh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float tanh(const float x) { - return ::tanh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float asin(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::asin(x); -#else - return ::asin(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION float acos(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::acos(x); -#else - return ::acos(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION float atan(const float x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::atan(x); -#else - return ::atan(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return FLT_EPSILON; } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - // C++ doesn't have a standard "half-float" type. 
- typedef float halfPrecision; - typedef double doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const float x) { - return isNan(x) || isInf(x); - } - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const float x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float conjugate(const float x) { - return conj(x); - } - static std::string name() { return "float"; } - static KOKKOS_FORCEINLINE_FUNCTION float squareroot(const float x) { - return sqrt(x); - } - static KOKKOS_FORCEINLINE_FUNCTION float nan() { -#if defined(__CUDA_ARCH__) - return CUDART_NAN_F; - // return nan (); //this returns 0??? -#elif defined(__HIP_DEVICE_COMPILE__) - return ::nanf(""); -#else - return std::numeric_limits::quiet_NaN(); -#endif // __CUDA_ARCH__ - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return FLT_MIN; // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { return FLT_RADIX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { - return eps() * static_cast(base()); - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { return FLT_MANT_DIG; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return 1.0; } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { return FLT_MIN_EXP; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return FLT_MIN; // ??? // should be base^(emin-1) - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { return FLT_MAX_EXP; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return FLT_MAX; // ??? // should be (base^emax)*(1-eps) - } -}; - -/// \brief Partial specialization for std::complex. -/// -/// The C++ Standard Library (with C++03 at least) only allows -/// std::complex for RealFloatType = float, double, or -/// long double. 
-template -class ArithTraits > { - public: - //! Kokkos internally replaces std::complex with Kokkos::complex. - typedef ::Kokkos::complex val_type; - typedef RealFloatType mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = true; - - static constexpr bool has_infinity = true; - static std::complex infinity() { - return std::complex(ArithTraits::infinity(), - ArithTraits::infinity()); - } - -#ifdef KOKKOS_ENABLE_SYCL - template - static bool isInf(const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isinf; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isinf; -#endif - return isinf(real(x)) || isinf(imag(x)); - } - template <> - static bool isInf(const std::complex& x) { - Kokkos::abort("isInf not available for std::complex!\n"); - return true; - } -#else - static bool isInf(const std::complex& x) { - return Kokkos::Experimental::isinf(real(x)) || - Kokkos::Experimental::isinf(imag(x)); - } -#endif -#ifdef KOKKOS_ENABLE_SYCL - template - static bool isNan(const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isnan; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isnan; -#endif - return isnan(real(x)) || isnan(imag(x)); - } - template <> - static bool isNan(const std::complex& x) { - Kokkos::abort("isNan not available for std::complex!\n"); - return true; - } -#else - static bool isNan(const std::complex& x) { - return Kokkos::Experimental::isnan(real(x)) || - Kokkos::Experimental::isnan(imag(x)); - } -#endif - static mag_type abs(const std::complex& x) { - return std::abs(x); - } - static std::complex zero() { - return std::complex(ArithTraits::zero(), - ArithTraits::zero()); - } - static std::complex one() { - return std::complex(ArithTraits::one(), - ArithTraits::zero()); - } - static std::complex 
min() { - return std::complex(ArithTraits::min(), - ArithTraits::zero()); - } - static std::complex max() { - return std::complex(ArithTraits::max(), - ArithTraits::zero()); - } - static mag_type real(const std::complex& x) { - return std::real(x); - } - static mag_type imag(const std::complex& x) { - return std::imag(x); - } - static std::complex conj( - const std::complex& x) { - return std::conj(x); - } - static std::complex pow(const std::complex& x, - const std::complex& y) { - // Fix for some weird gcc 4.2.1 inaccuracy. - if (y == one()) { - return x; - } else if (y == one() + one()) { - return x * x; - } else { - return std::pow(x, y); - } - } - static std::complex pow(const std::complex& x, - const RealFloatType& y) { - // Fix for some weird gcc 4.2.1 inaccuracy. - if (y == ArithTraits::one()) { - return x; - } else if (y == ArithTraits::one() + - ArithTraits::one()) { - return x * x; - } else { - return std::pow(x, y); - } - } - static std::complex sqrt( - const std::complex& x) { - return std::sqrt(x); - } - static std::complex cbrt( - const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::cbrt(x); -#else - return ::cbrt(x); -#endif - } - static std::complex exp(const std::complex& x) { - return std::exp(x); - } - static std::complex log(const std::complex& x) { - return std::log(x); - } - static std::complex log10( - const std::complex& x) { - return std::log10(x); - } - static std::complex sin(const std::complex& x) { - return std::sin(x); - } - static std::complex cos(const std::complex& x) { - return std::cos(x); - } - static std::complex tan(const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::tan(x); -#else - return std::tan(x); -#endif - } - static std::complex sinh( - const std::complex& x) { - return std::sinh(x); - } - static std::complex cosh( - const std::complex& x) { - return std::cosh(x); - } - static std::complex tanh( - const std::complex& x) { - return std::tanh(x); 
- } - static std::complex asin( - const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::asin(x); -#else - return ::asin(x); -#endif - } - static std::complex acos( - const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::acos(x); -#else - return ::acos(x); -#endif - } - static std::complex atan( - const std::complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using sycl::atan; -#else - using std::atan; -#endif - return atan(x); - } - static std::complex nan() { - const mag_type mag_nan = ArithTraits::nan(); - return std::complex(mag_nan, mag_nan); - } - static mag_type epsilon() { return ArithTraits::epsilon(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef std::complex::halfPrecision> - halfPrecision; - typedef std::complex::doublePrecision> - doublePrecision; - - static const bool isComplex = true; - static const bool isOrdinal = false; - static const bool isComparable = false; - static const bool hasMachineParameters = true; - static bool isnaninf(const std::complex& x) { - return isNan(x) || isInf(x); - } - static mag_type magnitude(const std::complex& x) { - return abs(x); - } - static std::complex conjugate( - const std::complex& x) { - return conj(x); - } - static std::string name() { - return std::string("std::complex<") + ArithTraits::name() + ">"; - } - static std::complex squareroot( - const std::complex& x) { - return sqrt(x); - } - static mag_type eps() { return epsilon(); } - static mag_type sfmin() { return ArithTraits::sfmin(); } - static int base() { return ArithTraits::base(); } - static mag_type prec() { return ArithTraits::prec(); } - static int t() { return ArithTraits::t(); } - static mag_type rnd() { return ArithTraits::one(); } - static int emin() { return ArithTraits::emin(); } - static mag_type rmin() { return ArithTraits::rmin(); } - static int emax() { return ArithTraits::emax(); } - static 
mag_type rmax() { return ArithTraits::rmax(); } -}; - -template <> -class ArithTraits { - public: - typedef double val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION double infinity() { return HUGE_VAL; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isinf; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isinf; -#endif - return isinf(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::isnan; -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - using sycl::isnan; -#endif - return isnan(x); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return ::fabs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0.0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1.0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return -DBL_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return DBL_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { - return 0.0; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return ::pow(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { -#ifdef 
KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::cbrt(x); -#else - return ::cbrt(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return ::exp(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return ::log(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return ::log10(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { - return ::sin(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { - return ::cos(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::tan(x); -#else - return std::tan(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { - return ::sinh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { - return ::cosh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { - return ::tanh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::asin(x); -#else - return ::asin(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::acos(x); -#else - return ::acos(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::atan(x); -#else - return ::atan(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { -#if defined(__CUDA_ARCH__) - return CUDART_NAN; - // return nan (); // this returns 0 ??? 
-#elif defined(__HIP_DEVICE_COMPILE__) - return ::nan(""); -#else - return std::numeric_limits::quiet_NaN(); -#endif // __CUDA_ARCH__ - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return DBL_EPSILON; } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef float halfPrecision; -#if defined(__CUDA_ARCH__) - typedef double - doublePrecision; // CUDA doesn't support long double, unfortunately -#elif defined(__HIP_DEVICE_COMPILE__) - typedef double - doublePrecision; // HIP does not support long double unfortunately -#else - typedef long double doublePrecision; -#endif // __CUDA_ARCH__ - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static std::string name() { return "double"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return DBL_MIN; // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { - return FLT_RADIX; // same for float as for double - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { - return eps() * static_cast(base()); - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { return DBL_MANT_DIG; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { return 1.0; } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { return DBL_MIN_EXP; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return DBL_MIN; // ??? 
// should be base^(emin-1) - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { return DBL_MAX_EXP; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return DBL_MAX; // ??? // should be (base^emax)*(1-eps) - } -}; - -// CUDA and HIP do not support long double in device functions, -// so none of the class methods in this specialization are marked -// as device functions. -template <> -class ArithTraits { - public: - typedef long double val_type; - typedef long double mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static constexpr bool has_infinity = true; - static long double infinity() { return HUGE_VALL; } - - static bool isInf(const val_type& x) { - using std::isinf; - return isinf(x); - } - static bool isNan(const val_type& x) { - using std::isnan; - return isnan(x); - } - static mag_type abs(const val_type& x) { return ::fabsl(x); } - static val_type zero() { return 0.0; } - static val_type one() { return 1.0; } - static val_type min() { return -LDBL_MAX; } - static val_type max() { return LDBL_MAX; } - static mag_type real(const val_type& x) { return x; } - static mag_type imag(const val_type&) { return zero(); } - static val_type conj(const val_type& x) { return x; } - static val_type pow(const val_type& x, const val_type& y) { - return ::pow(x, y); - } - static val_type sqrt(const val_type& x) { return ::sqrt(x); } - static val_type cbrt(const val_type& x) { return ::cbrtl(x); } - static val_type exp(const val_type& x) { return ::exp(x); } - static val_type log(const val_type& x) { return ::log(x); } - static val_type log10(const val_type& x) { return ::log10(x); } - static val_type sin(const val_type& x) { return ::sin(x); } - static val_type cos(const val_type& x) { return ::cos(x); } - static val_type tan(const val_type& x) { return ::tan(x); } - static val_type sinh(const 
val_type& x) { return ::sinh(x); } - static val_type cosh(const val_type& x) { return ::cosh(x); } - static val_type tanh(const val_type& x) { return ::tanh(x); } - static val_type asin(const val_type& x) { return ::asin(x); } - static val_type acos(const val_type& x) { return ::acos(x); } - static val_type atan(const val_type& x) { return ::atan(x); } - static val_type nan() { return std::numeric_limits::quiet_NaN(); } - static mag_type epsilon() { return LDBL_EPSILON; } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef double halfPrecision; - // It might be appropriate to use QD's qd_real here. - // For now, long double is the most you get. - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static mag_type magnitude(const val_type& x) { return abs(x); } - static val_type conjugate(const val_type& x) { return conj(x); } - static std::string name() { return "long double"; } - static val_type squareroot(const val_type& x) { return sqrt(x); } - static mag_type eps() { return epsilon(); } - static mag_type sfmin() { - return LDBL_MIN; // ??? - } - static int base() { - return FLT_RADIX; // same for float as for double or long double - } - static mag_type prec() { return eps() * static_cast(base()); } - static int t() { return LDBL_MANT_DIG; } - static mag_type rnd() { return one(); } - static int emin() { return LDBL_MIN_EXP; } - static mag_type rmin() { return LDBL_MIN; } - static int emax() { return LDBL_MAX_EXP; } - static mag_type rmax() { return LDBL_MAX; } -}; // long double specialization - -#ifdef HAVE_KOKKOSKERNELS_QUADMATH - -// CUDA does not support __float128 in device functions, so none of -// the class methods in this specialization are marked as device -// functions. 
-template <> -class ArithTraits<__float128> { - public: - typedef __float128 val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static constexpr bool has_infinity = true; - static __float128 infinity() { return 1.0q / 0.0q; } - - static bool isInf(const __float128 x) { return isinfq(x); } - static bool isNan(const __float128 x) { return isnanq(x); } - static mag_type abs(const __float128 x) { return fabsq(x); } - static __float128 zero() { return 0.0; } - static __float128 one() { return 1.0; } - static __float128 min() { return FLT128_MIN; } - static __float128 max() { return FLT128_MAX; } - static mag_type real(const __float128 x) { return x; } - static mag_type imag(const __float128 /* x */) { return 0.0; } - static __float128 conj(const __float128 x) { return x; } - static __float128 pow(const __float128 x, const __float128 y) { - return powq(x, y); - } - static __float128 sqrt(const __float128 x) { return sqrtq(x); } - static __float128 cbrt(const __float128 x) { return cbrtq(x); } - static __float128 exp(const __float128 x) { return exp(x); } - static __float128 log(const __float128 x) { return logq(x); } - static __float128 log10(const __float128 x) { return log10q(x); } - static __float128 sin(const __float128 x) { return sinq(x); } - static __float128 cos(const __float128 x) { return cosq(x); } - static __float128 tan(const __float128 x) { return tanq(x); } - static __float128 sinh(const __float128 x) { return sinhq(x); } - static __float128 cosh(const __float128 x) { return coshq(x); } - static __float128 tanh(const __float128 x) { return tanhq(x); } - static __float128 asin(const __float128 x) { return asinq(x); } - static __float128 acos(const __float128 x) { return acosq(x); } - static __float128 atan(const __float128 x) { return atanq(x); } - static mag_type 
epsilon() { return FLT128_EPSILON; } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef double halfPrecision; - // Unfortunately, we can't rely on a standard __float256 type. - typedef __float128 doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - static bool isnaninf(const __float128 x) { return isNan(x) || isInf(x); } - static magnitudeType magnitude(const __float128 x) { return abs(x); } - static __float128 conjugate(const __float128 x) { return conj(x); } - static std::string name() { return "__float128"; } - static __float128 squareroot(const __float128 x) { return sqrt(x); } - static __float128 nan() { - return strtoflt128("NAN()", NULL); // ??? - } - static mag_type eps() { return epsilon(); } - static mag_type sfmin() { - return FLT128_MIN; // ??? - } - static int base() { return 2; } - static mag_type prec() { return eps() * static_cast(base()); } - static int t() { return FLT_MANT_DIG; } - static mag_type rnd() { return 1.0; } - static int emin() { return FLT128_MIN_EXP; } - static mag_type rmin() { - return FLT128_MIN; // ??? // should be base^(emin-1) - } - static int emax() { return FLT128_MAX_EXP; } - static mag_type rmax() { - return FLT128_MAX; // ??? 
// should be (base^emax)*(1-eps) - } -}; -#endif // HAVE_KOKKOSKERNELS_QUADMATH - -template <> -class ArithTraits< ::Kokkos::complex > { - public: - typedef ::Kokkos::complex val_type; - typedef float mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = true; - - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return val_type(ArithTraits::infinity(), - ArithTraits::infinity()); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { - return ArithTraits::isInf(x.real()) || - ArithTraits::isInf(x.imag()); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { - return ArithTraits::isNan(x.real()) || - ArithTraits::isNan(x.imag()); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return std::sqrt(::Kokkos::real(x) * ::Kokkos::real(x) + - ::Kokkos::imag(x) * ::Kokkos::imag(x)); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return val_type(ArithTraits::zero(), - ArithTraits::zero()); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return val_type(ArithTraits::one(), - ArithTraits::zero()); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return val_type(ArithTraits::min(), - ArithTraits::min()); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return val_type(ArithTraits::max(), - ArithTraits::max()); // ??? 
- } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x.real(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type x) { - return x.imag(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return ::Kokkos::conj(x); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const - // val_type y) { - // const mag_type abs_x_square = x.real()*x.real() + x.imag()*x.imag(); - // const mag_type arg_x = ArithTraits::atan(x.imag()/x.real()); - // const mag_type half = mag_type(0.5); - // const mag_type alpha = (ArithTraits::pow(abs_x_square, - // half*y.real()) * - // ArithTraits::exp(-y.imag()*arg_x)); - // return val_type(alpha* ArithTraits::cos(y.real()*arg_x + - // half*y.imag()*ArithTraits::log(abs_x_square)), - // alpha* ArithTraits::sin(y.real()*arg_x + - // half*y.imag()*ArithTraits::log(abs_x_square))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const - // mag_type y) { - // const mag_type arg_x = ArithTraits::atan(x.imag()/x.real()); - // const mag_type alpha = ArithTraits::pow(abs(x),y); - // return val_type(alpha* ArithTraits::cos(y*arg_x), - // alpha* ArithTraits::sin(y*arg_x)); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return ::Kokkos::sqrt(x); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - // const mag_type r = ::Kokkos::abs(x); - // const mag_type phi = ::atan(x.imag()/x.real())/mag_type(3); - // const mag_type re = r* ::cos(phi); - // const mag_type im = r* ::sin(phi); - // return val_type(re,im); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { - // const mag_type xx = ::exp(x.real()); - // const mag_type re = xx* ::cos(x.imag()); - // const mag_type im = xx* ::sin(x.imag()); - // return val_type(re,im); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { - // return 
val_type(ArithTraits::log(abs(x)), - // ArithTraits::atan(x.imag()/x.real())); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type log10 (const val_type x) { - // return log(x)/ArithTraits::log(mag_type(10)); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type xx = exp(-ii*x) - exp(ii*x); - // const mag_type half = 0.5; - // return val_type(-half*xx.imag(),half*xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // const val_type xx = exp(x) - exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(),half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // const val_type ii(0, 1); - // const val_type e_nix = exp(-ii*x); - // const val_type e_pix = exp( ii*x); - // return ii*(e_nix - e_pix)/(e_nix + e_pix); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // const val_type xx = exp(x) - exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(), half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // const val_type xx = exp(x) + exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(), half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // const val_type e_2x = exp(2*x); - // return (e_2x - 1)/(e_2x + 1); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // const val_type ii(0, 1); - // const val_type xx = -ii*log(ii*x + sqrt(val_type(1) - x*x)); - // return val_type(xx.imag(),-xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type xx = -ii*log(x + ii*sqrt(val_type(1) - x*x)); - // return val_type(xx.imag(),-xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION 
val_type atan (const val_type x) { - // val_type r_val; - // const val_type ii = val_type(0, 1); - // if (x == ii) { - // r_val = val_type(ArithTraits::nan(), - // std::numeric_limits::infinity()); - // } if (x == -ii) { - // r_val = val_type(ArithTraits::nan(), - // -std::numeric_limits::infinity()); - // } else { - // const val_type ii_x = ii*x; - // const mag_type half = 0.5; - // const val_type xx = log(val_type(1) - ii_x) - log(val_type(1) + ii_x); - // r_val = val_type(-half*xx.imag(), half*xx.real()); - // } - // return r_val; - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // ??? - return val_type(ArithTraits::nan(), ArithTraits::nan()); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { - return ArithTraits::epsilon(); // ??? - } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef ::Kokkos::complex::halfPrecision> halfPrecision; - typedef ::Kokkos::complex::doublePrecision> - doublePrecision; - - static const bool isComplex = true; - static const bool isOrdinal = false; - static const bool isComparable = false; - static const bool hasMachineParameters = - ArithTraits::hasMachineParameters; - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static std::string name() { return "Kokkos::complex"; } - // static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) { - // return sqrt (x); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return ArithTraits::sfmin(); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION int base() { - return ArithTraits::base(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { - return ArithTraits::prec(); // ??? 
- } - static KOKKOS_FORCEINLINE_FUNCTION int t() { - return ArithTraits::t(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { - return ArithTraits::rnd(); - } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { - return ArithTraits::emin(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return ArithTraits::rmin(); - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { - return ArithTraits::emax(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return ArithTraits::rmax(); - } -}; - -template <> -class ArithTraits< ::Kokkos::complex > { - public: - typedef ::Kokkos::complex val_type; - typedef double mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = true; - - static constexpr bool has_infinity = true; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { - return val_type(ArithTraits::infinity(), - ArithTraits::infinity()); - } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type x) { - return ArithTraits::isInf(x.real()) || - ArithTraits::isInf(x.imag()); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type x) { - return ArithTraits::isNan(x.real()) || - ArithTraits::isNan(x.imag()); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return ::Kokkos::abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { - return val_type(ArithTraits::zero(), - ArithTraits::zero()); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { - return val_type(ArithTraits::one(), - ArithTraits::zero()); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - return val_type(ArithTraits::min(), - ArithTraits::min()); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { - return val_type(ArithTraits::max(), - ArithTraits::max()); // ??? 
- } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x.real(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type x) { - return x.imag(); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return ::Kokkos::conj(x); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const - // val_type y) { - // const mag_type abs_x_square = x.real()*x.real() + x.imag()*x.imag(); - // const mag_type arg_x = ArithTraits::atan(x.imag()/x.real()); - // const mag_type half = mag_type(0.5); - // const mag_type alpha = (ArithTraits::pow(abs_x_square, - // half*y.real()) * - // ArithTraits::exp(-y.imag()*arg_x)); - // return val_type(alpha* ArithTraits::cos(y.real()*arg_x + - // half*y.imag()*ArithTraits::log(abs_x_square)), - // alpha* ArithTraits::sin(y.real()*arg_x + - // half*y.imag()*ArithTraits::log(abs_x_square))); - - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type pow (const val_type x, const - // mag_type y) { - // const mag_type arg_x = ArithTraits::atan(x.imag()/x.real()); - // const mag_type alpha = ArithTraits::pow(abs(x),y); - // return val_type(alpha* ArithTraits::cos(y*arg_x), - // alpha* ArithTraits::sin(y*arg_x)); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return ::Kokkos::sqrt(x); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt (const val_type x) { - // const mag_type r = ::Kokkos::abs(x); - // const mag_type phi = ::atan(x.imag()/x.real())/mag_type(3); - // const mag_type re = r* ::cos(phi); - // const mag_type im = r* ::sin(phi); - // return val_type(re,im); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type exp (const val_type x) { - // const mag_type xx = ::exp(x.real()); - // const mag_type re = xx* ::cos(x.imag()); - // const mag_type im = xx* ::sin(x.imag()); - // return val_type(re,im); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type log (const val_type x) { - // return 
val_type(ArithTraits::log(abs(x)), - // ArithTraits::atan(x.imag()/x.real())); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type log10 (const val_type x) { - // return log(x)/ArithTraits::log(mag_type(10)); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type xx = exp(-ii*x) - exp(ii*x); - // const mag_type half = 0.5; - // return val_type(-half*xx.imag(),half*xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // const val_type xx = exp(x) - exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(),half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type e_nix = exp(-ii*x); - // const val_type e_pix = exp( ii*x); - // return ii*(e_nix - e_pix)/(e_nix + e_pix); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // const val_type xx = exp(x) - exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(), half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // const val_type xx = exp(x) + exp(-x); - // const mag_type half = 0.5; - // return val_type(half*xx.real(), half*xx.imag()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // const val_type e_2x = exp(2*x); - // return (e_2x - 1)/(e_2x + 1); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // const val_type ii(0, 1); - // const val_type xx = -ii*log(ii*x + sqrt(val_type(1) - x*x)); - // return val_type(xx.imag(),-xx.real()); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // const val_type ii = val_type(0, 1); - // const val_type xx = -ii*log(x + ii*sqrt(val_type(1) - x*x)); - // return val_type(xx.imag(),-xx.real()); - // } - // static 
KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // val_type r_val; - // const val_type ii = val_type(0, 1); - // if (x == ii) { - // r_val = val_type(ArithTraits::nan(), - // std::numeric_limits::infinity()); - // } if (x == -ii) { - // r_val = val_type(ArithTraits::nan(), - // -std::numeric_limits::infinity()); - // } else { - // const val_type ii_x = ii*x; - // const mag_type half = 0.5; - // const val_type xx = log(val_type(1) - ii_x) - log(val_type(1) + ii_x); - // r_val = val_type(-half*xx.imag(), half*xx.real()); - // } - // return r_val; - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // ??? - return val_type(ArithTraits::nan(), ArithTraits::nan()); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { - return ArithTraits::epsilon(); // ??? - } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef ::Kokkos::complex::halfPrecision> halfPrecision; - typedef ::Kokkos::complex::doublePrecision> - doublePrecision; - - static const bool isComplex = true; - static const bool isOrdinal = false; - static const bool isComparable = false; - static const bool hasMachineParameters = - ArithTraits::hasMachineParameters; - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static std::string name() { return "Kokkos::complex"; } - // static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot (const val_type x) { - // return sqrt (x); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type eps() { return epsilon(); } - static KOKKOS_FORCEINLINE_FUNCTION mag_type sfmin() { - return ArithTraits::sfmin(); // ??? 
- } - static KOKKOS_FORCEINLINE_FUNCTION int base() { - return ArithTraits::base(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type prec() { - return ArithTraits::prec(); // ??? - } - static KOKKOS_FORCEINLINE_FUNCTION int t() { - return ArithTraits::t(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rnd() { - return ArithTraits::rnd(); - } - static KOKKOS_FORCEINLINE_FUNCTION int emin() { - return ArithTraits::emin(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmin() { - return ArithTraits::rmin(); - } - static KOKKOS_FORCEINLINE_FUNCTION int emax() { - return ArithTraits::emax(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type rmax() { - return ArithTraits::rmax(); - } -}; - -template <> -class ArithTraits { - public: - typedef char val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - // The C(++) standard does not require that char be signed. In - // fact, signed char, unsigned char, and char are distinct types. - // We can use std::numeric_limits here because it's a const bool, - // not a class method. 
- static const bool is_signed = std::numeric_limits::is_signed; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - // This avoids warnings based on whether char is signed or unsigned - return integer_abs::abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return CHAR_MIN; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return CHAR_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - if (is_signed) { - return intPowSigned(x, y); - } else { - return intPowUnsigned(x, y); - } - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // C++11 defines std::sqrt for integer arguments. However, we - // currently can't assume C++11. - // - // This cast will result in no loss of accuracy, though it might - // be more expensive than it should, if we were clever about using - // bit operations. - // - // We take the absolute value first to avoid negative arguments. - // Negative real arguments to sqrt(float) return (float) NaN, but - // built-in integer types do not have an equivalent to NaN. 
- // Casting NaN to an integer type will thus result in some integer - // value which appears valid, but is not. We cannot raise an - // exception in device functions. Thus, we prefer to take the - // absolute value of x first, to avoid issues. Another - // possibility would be to test for a NaN output and convert it to - // some reasonable value (like 0), though this might be more - // expensive than the absolute value interpreted using the ternary - // operator. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return 
static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "char"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef signed char val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x >= 0 ? 
x : -x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return SCHAR_MIN; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return SCHAR_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowSigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return 
static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "signed char"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef unsigned char val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 
0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; // it's unsigned, so it's positive - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return UCHAR_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowUnsigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. 
- return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION 
mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "unsigned char"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef short val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - // std::abs appears to work with CUDA 5.5 at least, but I'll use - // the ternary expression for maximum generality. Note that this - // expression does not necessarily obey the rules for fabs() with - // NaN input, so it should not be used for floating-point types. - // It's perfectly fine for signed integer types, though. - return x >= 0 ? 
x : -x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - // Macros like this work with CUDA, but - // std::numeric_limits::min() does not, because it is - // not marked as a __device__ function. - return SHRT_MIN; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return SHRT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowSigned(x, y); - } - //! Integer square root returns a lower bound. - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. 
- return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static 
KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // short doesn't implement a NaN value, but we can still have it - // return some "flag" value that can help users find use of - // uninitialized data. - return static_cast(-1); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "short"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef unsigned short val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; // it's unsigned, so it's positive - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } 
- static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return USHRT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowUnsigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static 
KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // unsigned short doesn't implement a NaN value, but we can still - // have it return some "flag" value that can help users find use - // of uninitialized data. - return max(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. 
- typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "unsigned short"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef int val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - // std::abs appears to work with CUDA 5.5 at least, but I'll use - // the ternary expression for maximum generality. Note that this - // expression does not necessarily obey the rules for fabs() with - // NaN input, so it should not be used for floating-point types. - // It's perfectly fine for signed integer types, though. - return x >= 0 ? 
x : -x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { - // Macros like INT_MIN work with CUDA, but - // std::numeric_limits::min() does not, because it is - // not marked as a __device__ function. - return INT_MIN; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return INT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowSigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. 
- return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static 
KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // int doesn't implement a NaN value, but we can still have it - // return some "flag" value that can help users find use of - // uninitialized data. - return -1; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "int"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef unsigned int val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; // it's unsigned, so it's positive - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static 
KOKKOS_FORCEINLINE_FUNCTION val_type max() { return UINT_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowUnsigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - // This will result in no loss of accuracy, though it might be - // more expensive than it should, if we were clever about using - // bit operations. - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::sqrt(static_cast(abs(x))) -#else - ::sqrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION 
val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // unsigned int doesn't implement a NaN value, but we can still - // have it return some "flag" value that can help users find use - // of uninitialized data. - return max(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. 
- typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "unsigned int"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef long val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x >= 0 ? 
x : -x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return LONG_MIN; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return LONG_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowSigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - using std::abs; - using std::sqrt; -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - return static_cast(sqrt(static_cast(abs(x)))); -#else - return static_cast(sqrt(static_cast(abs(x)))); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } 
- // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // long doesn't implement a NaN value, but we can still have it - // return some "flag" value that can help users find use of - // uninitialized data. - return -1; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "long"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef unsigned long val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - 
static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return ULONG_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowUnsigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { - using std::sqrt; -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - return static_cast(sqrt(static_cast(x))); -#else - return static_cast(sqrt(static_cast(x))); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::cbrtl; - return static_cast(::cbrtl(static_cast(x))); -#else - return static_cast( -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - sycl::cbrt(static_cast(abs(x))) -#else - ::cbrt(static_cast(abs(x))) -#endif - ); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) 
{ - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // unsigned long doesn't implement a NaN value, but we can still - // have it return some "flag" value that can help users find use - // of uninitialized data. - return max(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. 
- typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "unsigned long"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef long long val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x >= 0 ? 
x : -x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return LLONG_MIN; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return LLONG_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowSigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::abs; - using std::sqrt; - // IEEE 754 promises that long double has at least 64 significand - // bits, so we can use it to represent any signed or unsigned - // 64-bit integer type exactly. However, CUDA does not implement - // long double for device functions. - return static_cast(sqrt(static_cast(abs(x)))); -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - // Casting from a 64-bit integer type to double does result in a - // loss of accuracy. However, it gives us a good first - // approximation. For very large numbers, we may lose some - // significand bits, but will always get within a factor of two - // (assuming correct rounding) of the exact double-precision - // number. We could then binary search between half the result - // and twice the result (assuming the latter is <= INT64_MAX, - // which it has to be, so we don't have to check) to ensure - // correctness. It actually should suffice to check numbers - // within 1 of the result. 
- return static_cast(sycl::sqrt(static_cast(abs(x)))); -#else - return static_cast(::sqrt(static_cast(abs(x)))); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::abs; - using std::cbrtl; - return static_cast(cbrtl(static_cast(abs(x)))); -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - return static_cast(sycl::cbrt(static_cast(abs(x)))); -#else - return static_cast(::cbrt(static_cast(abs(x)))); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(abs(x)))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(abs(x)))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan (const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static 
KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // long long doesn't implement a NaN value, but we can still have - // it return some "flag" value that can help users find use of - // uninitialized data. - return -1; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. - typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "long long"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -template <> -class ArithTraits { - public: - typedef unsigned long long val_type; - typedef val_type mag_type; - - static const bool is_specialized = true; - static const bool is_signed = false; - static const bool is_integer = true; - static const bool is_exact = true; - static const bool is_complex = false; - - static constexpr bool has_infinity = false; - static KOKKOS_FORCEINLINE_FUNCTION val_type infinity() { return 0; } - - static KOKKOS_FORCEINLINE_FUNCTION bool isInf(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION bool isNan(const val_type) { - return false; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type abs(const val_type x) { - return x; // unsigned integers are always nonnegative - } - static KOKKOS_FORCEINLINE_FUNCTION val_type zero() { 
return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type one() { return 1; } - static KOKKOS_FORCEINLINE_FUNCTION val_type min() { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type max() { return ULLONG_MAX; } - static KOKKOS_FORCEINLINE_FUNCTION mag_type real(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type imag(const val_type) { return 0; } - static KOKKOS_FORCEINLINE_FUNCTION val_type conj(const val_type x) { - return x; - } - static KOKKOS_FORCEINLINE_FUNCTION val_type pow(const val_type x, - const val_type y) { - return intPowUnsigned(x, y); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sqrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::sqrt; - return static_cast(sqrt(static_cast(x))); -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - return static_cast(sycl::sqrt(static_cast(x))); -#else - return static_cast(::sqrt(static_cast(x))); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cbrt(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST - using std::cbrtl; - return static_cast(cbrtl(static_cast(x))); -#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) - return static_cast(sycl::cbrt(static_cast(x))); -#else - return static_cast(::cbrt(static_cast(x))); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type exp(const val_type x) { - return static_cast(::exp(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log(const val_type x) { - return static_cast(::log(static_cast(x))); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type log10(const val_type x) { - return static_cast(::log10(static_cast(x))); - } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sin (const val_type x) { - // return static_cast ( ::sin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cos (const val_type x) { - // return static_cast ( ::cos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tan 
(const val_type x) { - // return static_cast ( ::tan (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type sinh (const val_type x) { - // return static_cast ( ::sinh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type cosh (const val_type x) { - // return static_cast ( ::cosh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type tanh (const val_type x) { - // return static_cast ( ::tanh (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type asin (const val_type x) { - // return static_cast ( ::asin (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type acos (const val_type x) { - // return static_cast ( ::acos (static_cast (x))); - // } - // static KOKKOS_FORCEINLINE_FUNCTION val_type atan (const val_type x) { - // return static_cast ( ::atan (static_cast (x))); - // } - static KOKKOS_FORCEINLINE_FUNCTION val_type nan() { - // unsigned long long doesn't implement a NaN value, but we can - // still have it return some "flag" value that can help users find - // use of uninitialized data. - return max(); - } - static KOKKOS_FORCEINLINE_FUNCTION mag_type epsilon() { return zero(); } - - // Backwards compatibility with Teuchos::ScalarTraits. 
- typedef mag_type magnitudeType; - typedef val_type halfPrecision; - typedef val_type doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = true; - static const bool isComparable = true; - static const bool hasMachineParameters = false; - static KOKKOS_FORCEINLINE_FUNCTION magnitudeType magnitude(const val_type x) { - return abs(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type conjugate(const val_type x) { - return conj(x); - } - static KOKKOS_FORCEINLINE_FUNCTION bool isnaninf(const val_type) { - return false; - } - static std::string name() { return "unsigned long long"; } - static KOKKOS_FORCEINLINE_FUNCTION val_type squareroot(const val_type x) { - return sqrt(x); - } -}; - -// dd_real and qd_real are floating-point types provided by the QD -// library of David Bailey (LBNL): -// -// http://crd-legacy.lbl.gov/~dhbailey/mpdist/ -// -// dd_real uses two doubles (128 bits), and qd_real uses four doubles -// (256 bits). -// -// Kokkos does not currently support these types in device -// functions. It should be possible to use Kokkos' support for -// aggregate types to implement device function support for dd_real -// and qd_real, but we have not done this yet (as of 09 Jan 2015). -// Hence, the class methods of the ArithTraits specializations for -// dd_real and qd_real are not marked as device functions. 
-#ifdef HAVE_KOKKOS_QD -template <> -struct ArithTraits { - typedef dd_real val_type; - typedef dd_real mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static inline bool isInf(const val_type& x) { return isinf(x); } - static inline bool isNan(const val_type& x) { return isnan(x); } - static inline mag_type abs(const val_type& x) { return ::abs(x); } - static inline val_type zero() { return val_type(0.0); } - static inline val_type one() { return val_type(1.0); } - static inline val_type min() { return std::numeric_limits::min(); } - static inline val_type max() { return std::numeric_limits::max(); } - static inline mag_type real(const val_type& x) { return x; } - static inline mag_type imag(const val_type&) { return zero(); } - static inline val_type conj(const val_type& x) { return x; } - static inline val_type pow(const val_type& x, const val_type& y) { - return ::pow(x, y); - } - static inline val_type sqrt(const val_type& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif - } - static inline val_type cbrt(const val_type& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::cbrt(x); -#else - return ::cbrt(x); -#endif - } - static inline val_type exp(const val_type& x) { return ::exp(x); } - static inline val_type log(const val_type& x) { - // dd_real puts its transcendental functions in the global namespace. 
- return ::log(x); - } - static inline val_type log10(const val_type& x) { return ::log10(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { - return ::sin(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { - return ::cos(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::tan(x); -#else - return std::tan(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { - return ::sinh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { - return ::cosh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { - return ::tanh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::asin(x); -#else - return ::asin(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::acos(x); -#else - return ::acos(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::atan(x); -#else - return ::atan(x); -#endif - } - static inline val_type nan() { return val_type::_nan; } - static val_type epsilon() { return std::numeric_limits::epsilon(); } - - typedef dd_real magnitudeType; - typedef double halfPrecision; - typedef qd_real doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - - static mag_type eps() { return epsilon(); } - static mag_type sfmin() { return min(); } - static int base() { return std::numeric_limits::radix; } - static mag_type prec() { return eps() * base(); } - static int t() { return std::numeric_limits::digits; } - static mag_type rnd() { - return 
std::numeric_limits::round_style == std::round_to_nearest - ? one() - : zero(); - } - static int emin() { return std::numeric_limits::min_exponent; } - static mag_type rmin() { return std::numeric_limits::min(); } - static int emax() { return std::numeric_limits::max_exponent; } - static mag_type rmax() { return std::numeric_limits::max(); } - static mag_type magnitude(const val_type& x) { return ::abs(x); } - static val_type conjugate(const val_type& x) { return conj(x); } - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static std::string name() { return "dd_real"; } - static val_type squareroot(const val_type& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif - } -}; - -template <> -struct ArithTraits { - typedef qd_real val_type; - typedef qd_real mag_type; - - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool is_complex = false; - - static inline bool isInf(const val_type& x) { return isinf(x); } - static inline bool isNan(const val_type& x) { return isnan(x); } - static inline mag_type abs(const val_type& x) { return ::abs(x); } - static inline val_type zero() { return val_type(0.0); } - static inline val_type one() { return val_type(1.0); } - static inline val_type min() { return std::numeric_limits::min(); } - static inline val_type max() { return std::numeric_limits::max(); } - static inline mag_type real(const val_type& x) { return x; } - static inline mag_type imag(const val_type&) { return zero(); } - static inline val_type conj(const val_type& x) { return x; } - static inline val_type pow(const val_type& x, const val_type& y) { - return ::pow(x, y); - } - static inline val_type sqrt(const val_type& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif - } - static inline 
val_type cbrt(const val_type& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::cbrt(x); -#else - return ::cbrt(x); -#endif - } - static inline val_type exp(const val_type& x) { return ::exp(x); } - static inline val_type log(const val_type& x) { - // val_type puts its transcendental functions in the global namespace. - return ::log(x); - } - static inline val_type log10(const val_type& x) { return ::log10(x); } - static KOKKOS_FORCEINLINE_FUNCTION val_type sin(const val_type x) { - return ::sin(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cos(const val_type x) { - return ::cos(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::tan(x); -#else - return std::tan(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type sinh(const val_type x) { - return ::sinh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type cosh(const val_type x) { - return ::cosh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type tanh(const val_type x) { - return ::tanh(x); - } - static KOKKOS_FORCEINLINE_FUNCTION val_type asin(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::asin(x); -#else - return ::asin(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type acos(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::acos(x); -#else - return ::acos(x); -#endif - } - static KOKKOS_FORCEINLINE_FUNCTION val_type atan(const val_type x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::atan(x); -#else - return ::atan(x); -#endif - } - static inline val_type nan() { return val_type::_nan; } - static inline val_type epsilon() { - return std::numeric_limits::epsilon(); - } - - typedef qd_real magnitudeType; - typedef dd_real halfPrecision; - // The QD library does not have an "oct-double real" class. 
One - // could use an arbitrary-precision library like MPFR or ARPREC, - // with the precision set appropriately, to get an - // extended-precision type for qd_real. - typedef qd_real doublePrecision; - - static const bool isComplex = false; - static const bool isOrdinal = false; - static const bool isComparable = true; - static const bool hasMachineParameters = true; - - static mag_type eps() { return epsilon(); } - static mag_type sfmin() { return min(); } - static int base() { return std::numeric_limits::radix; } - static mag_type prec() { return eps() * base(); } - static int t() { return std::numeric_limits::digits; } - static mag_type rnd() { - return std::numeric_limits::round_style == std::round_to_nearest - ? one() - : zero(); - } - static int emin() { return std::numeric_limits::min_exponent; } - static mag_type rmin() { return std::numeric_limits::min(); } - static int emax() { return std::numeric_limits::max_exponent; } - static mag_type rmax() { return std::numeric_limits::max(); } - static mag_type magnitude(const val_type& x) { return ::abs(x); } - static val_type conjugate(const val_type& x) { return conj(x); } - static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } - static std::string name() { return "qd_real"; } - static val_type squareroot(const val_type& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - return sycl::sqrt(x); -#else - return ::sqrt(x); -#endif - } -}; -#endif // HAVE_KOKKOS_QD - -} // namespace Details - -// Promote ArithTraits into Kokkos namespace. At some point, we -// will remove it from the Details namespace completely. We leave -// it there for now, because a lot of code depends on it being -// there. 
-using Details::ArithTraits; -} // namespace Kokkos - -#endif // KOKKOS_ARITHTRAITS_HPP diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 2b523e1e5f..46b97ee039 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -123,9 +123,7 @@ struct Flush { void init(value_type &update) { update = 0; } KOKKOS_INLINE_FUNCTION - void join(volatile value_type &update, const volatile value_type &input) { - update += input; - } + void join(value_type &update, const value_type &input) { update += input; } KOKKOS_INLINE_FUNCTION void operator()(const int i, value_type &update) const { update += _buf[i]; } @@ -201,7 +199,8 @@ struct SIMD { std::is_same >::value || std::is_same >::value || std::is_same >::value || - std::is_same::value, + std::is_same::value || + std::is_same::value, "KokkosKernels:: Invalid SIMD<> type."); using value_type = T; }; @@ -718,6 +717,17 @@ KOKKOS_INLINE_FUNCTION iMatrix = iTemp / numRows; } +template +KOKKOS_INLINE_FUNCTION + typename std::enable_if::value, + void>::type + getIndices(const OrdinalType iTemp, const OrdinalType /*numRows*/, + const OrdinalType numMatrices, OrdinalType &iRow, + OrdinalType &iMatrix) { + iRow = iTemp / numMatrices; + iMatrix = iTemp % numMatrices; +} + template KOKKOS_INLINE_FUNCTION auto transpose_2d_view(ViewType v, const int *order) { constexpr int rank = 2; @@ -842,10 +852,9 @@ KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c, template KOKKOS_INLINE_FUNCTION ViewValueType fma_alpha(ViewValueType reg_c, - ScalarType alpha, + ScalarType /*alpha*/, const AlphaTag::No &) { return reg_c; - (void)alpha; } template +template struct SerialCopy { template KOKKOS_INLINE_FUNCTION static int invoke(const AViewType &A, diff --git a/src/batched/dense/KokkosBatched_Gesv.hpp b/src/batched/dense/KokkosBatched_Gesv.hpp new file mode 100644 index 0000000000..cda2225c43 --- /dev/null +++ b/src/batched/dense/KokkosBatched_Gesv.hpp @@ -0,0 +1,180 
@@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +#ifndef __KOKKOSBATCHED_GESV_HPP__ +#define __KOKKOSBATCHED_GESV_HPP__ + +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include "KokkosBatched_Vector.hpp" + +namespace KokkosBatched { + +struct Gesv { + struct StaticPivoting {}; + struct NoPivoting {}; + + using Default = StaticPivoting; +}; + +/// \brief Serial Batched GESV: +/// +/// Solve A_l x_l = b_l for all l = 0, ..., N +/// using a batched LU decomposition, 2 batched triangular solves, and a batched +/// static pivoting. +/// +/// \tparam MatrixType: Input type for the matrix, needs to be a 2D view +/// \tparam VectorType: Input type for the right-hand side and the solution, +/// needs to be a 1D view +/// +/// \param A [in]: matrix, a rank 2 view +/// \param X [out]: solution, a rank 1 view +/// \param Y [in]: right-hand side, a rank 1 view +/// \param tmp [in]: a rank 2 view used to store temporary variable; dimension +/// must be n x (n+4) where n is the number of rows. +/// +/// +/// Two versions are available (those are chosen based on ArgAlgo): +/// +/// 1. NoPivoting: the solver does not use a pivoting strategy, +/// 2. StaticPivoting: the solver uses a static pivoting strategy that relies +/// on using +/// maximal absolute value of row and column to choose pivots and apply +/// them before calling the LU decomposition. Known limitation: the +/// currently implemented strategy would not work with some matrices such +/// as [[2, 1], [1, 0]], when this is the case, the Gesv (if used with +/// pivoting), will return 1 and print an error message. +/// +/// No nested parallel_for is used inside of the function. 
+/// + +template +struct SerialGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, + const VectorType X, + const VectorType Y, + const MatrixType tmp); +}; + +/// \brief Team Batched GESV: +/// +/// Solve A_l x_l = b_l for all l = 0, ..., N +/// using a batched LU decomposition, 2 batched triangular solves, and a batched +/// static pivoting. +/// +/// \tparam MatrixType: Input type for the matrix, needs to be a 2D view +/// \tparam VectorType: Input type for the right-hand side and the solution, +/// needs to be a 1D view +/// +/// \param member [in]: TeamPolicy member +/// \param A [in]: matrix, a rank 2 view +/// \param X [out]: solution, a rank 1 view +/// \param Y [in]: right-hand side, a rank 1 view +/// +/// Two versions are available (those are chosen based on ArgAlgo): +/// +/// 1. NoPivoting: the solver does not use a pivoting strategy, +/// 2. StaticPivoting: the solver uses a static pivoting strategy that relies +/// on using +/// maximal absolute value of row and column to choose pivots and apply +/// them before calling the LU decomposition. Known limitation: the +/// currently implemented strategy would not work with some matrices such +/// as [[2, 1], [1, 0]], when this is the case, the Gesv (if used with +/// pivoting), will return 1 and print an error message. +/// +/// A nested parallel_for with TeamThreadRange is used. +/// + +template +struct TeamGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const MatrixType A, + const VectorType X, + const VectorType Y); +}; + +/// \brief Team Vector Batched GESV: +/// +/// Solve A_l x_l = b_l for all l = 0, ..., N +/// using a batched LU decomposition, 2 batched triangular solves, and a batched +/// static pivoting. 
+/// +/// \tparam MatrixType: Input type for the matrix, needs to be a 2D view +/// \tparam VectorType: Input type for the right-hand side and the solution, +/// needs to be a 1D view +/// +/// \param member [in]: TeamPolicy member +/// \param A [in]: matrix, a rank 2 view +/// \param X [out]: solution, a rank 1 view +/// \param Y [in]: right-hand side, a rank 1 view +/// +/// Two versions are available (those are chosen based on ArgAlgo): +/// +/// 1. NoPivoting: the solver does not use a pivoting strategy, +/// 2. StaticPivoting: the solver uses a static pivoting strategy that relies +/// on using +/// maximal absolute value of row and column to choose pivots and apply +/// them before calling the LU decomposition. Known limitation: the +/// currently implemented strategy would not work with some matrices such +/// as [[2, 1], [1, 0]], when this is the case, the Gesv (if used with +/// pivoting), will return 1 and print an error message. +/// +/// Two nested parallel_for with both TeamVectorRange and ThreadVectorRange +/// (or one with TeamVectorRange) are used inside. 
+/// + +template +struct TeamVectorGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const MatrixType A, + const VectorType X, + const VectorType Y); +}; + +} // namespace KokkosBatched + +#include "KokkosBatched_Gesv_Impl.hpp" + +#endif diff --git a/src/batched/dense/KokkosBatched_LU_Decl.hpp b/src/batched/dense/KokkosBatched_LU_Decl.hpp index 8cffbdc766..9fa2e2b6e3 100644 --- a/src/batched/dense/KokkosBatched_LU_Decl.hpp +++ b/src/batched/dense/KokkosBatched_LU_Decl.hpp @@ -51,4 +51,7 @@ struct LU { } // namespace KokkosBatched +#include "KokkosBatched_LU_Serial_Impl.hpp" +#include "KokkosBatched_LU_Team_Impl.hpp" + #endif diff --git a/src/batched/dense/KokkosBatched_Scale_Decl.hpp b/src/batched/dense/KokkosBatched_Scale_Decl.hpp index f3ea9b0aab..f0675892fc 100644 --- a/src/batched/dense/KokkosBatched_Scale_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Scale_Decl.hpp @@ -3,8 +3,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) -#include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Vector.hpp" +#include "impl/Kokkos_Error.hpp" namespace KokkosBatched { @@ -12,38 +11,50 @@ namespace KokkosBatched { /// Serial Scale /// -struct SerialScale { - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A); -}; +struct [[deprecated]] SerialScale{ + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, + const AViewType &A){Kokkos::abort( + "KokkosBatched::SerialScale is deprecated: use KokkosBlas::SerialScale " + "instead"); +return 0; +} // namespace KokkosBatched +} +; /// /// Team Scale /// template -struct TeamScale { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A); -}; +struct [[deprecated]] TeamScale{ + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A){Kokkos::abort( + "KokkosBatched::TeamScale is 
deprecated: use KokkosBlas::TeamScale " + "instead"); +return 0; +} +} +; /// /// TeamVector Scale /// template -struct TeamVectorScale { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A); -}; +struct [[deprecated]] TeamVectorScale{ + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType &member, const ScalarType alpha, const AViewType &A){ + Kokkos::abort("KokkosBatched::TeamVectorScale is deprecated: use " + "KokkosBlas::TeamVectorScale instead"); +return 0; +} +} +; } // namespace KokkosBatched -#include "KokkosBatched_Scale_Impl.hpp" - #endif diff --git a/src/batched/dense/KokkosBatched_Set_Decl.hpp b/src/batched/dense/KokkosBatched_Set_Decl.hpp index 4ef0078e50..fd67cdc99b 100644 --- a/src/batched/dense/KokkosBatched_Set_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Set_Decl.hpp @@ -3,46 +3,57 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) -#include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Vector.hpp" +#include "impl/Kokkos_Error.hpp" namespace KokkosBatched { /// /// Serial Set /// -struct SerialSet { - template - KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, - const AViewType &A); -}; +struct [[deprecated]] SerialSet{ + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, + const AViewType &A){Kokkos::abort( + "KokkosBatched::SerialSet is deprecated: use KokkosBlas::SerialSet " + "instead"); +return 0; +} // namespace KokkosBatched +} +; /// /// Team Set /// template -struct TeamSet { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A); -}; +struct [[deprecated]] TeamSet{ + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A){Kokkos::abort( + "KokkosBatched::TeamSet is deprecated: use KokkosBlas::TeamSet " + "instead"); +return 0; +} +} +; /// /// TeamVector Set /// 
template -struct TeamVectorSet { - template - KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A); -}; +struct [[deprecated]] TeamVectorSet{ + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType &member, const ScalarType alpha, const AViewType &A){ + Kokkos::abort("KokkosBatched::TeamVectorSet is deprecated: use " + "KokkosBlas::TeamVectorSet instead"); +return 0; +} +} +; } // namespace KokkosBatched -#include "KokkosBatched_Set_Impl.hpp" - #endif diff --git a/src/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp b/src/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp index f11210253e..32980219bf 100644 --- a/src/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_FindAmax_Internal.hpp @@ -43,7 +43,7 @@ struct TeamVectorFindAmaxInternal { if (m > 0) { using reducer_value_type = typename Kokkos::MaxLoc::value_type; - reducer_value_type value; + reducer_value_type value{}; Kokkos::MaxLoc reducer_value(value); Kokkos::parallel_reduce( Kokkos::TeamVectorRange(member, m), diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 7bc5529fcc..d6331e215d 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -59,6 +59,21 @@ namespace Impl { /// CT/NT, NT/CT, CT/CT /// +struct LayoutLeftTag {}; +struct LayoutRightTag {}; +template +struct TagFromLayoutHelper; +template <> +struct TagFromLayoutHelper { + using tag = LayoutLeftTag; +}; +template <> +struct TagFromLayoutHelper { + using tag = LayoutRightTag; +}; +template +using TagFromLayout = typename TagFromLayoutHelper::tag; + // TODO - scaling between (32x32, 64x64) // Option 0: Increase number of tiles and figure out how to map kokkos teams // into cuda grid. Keep team size and vector lanes constant. 
@@ -117,7 +132,8 @@ class BatchedDblBufGemm { private: void __run() { - using policy_type = Kokkos::TeamPolicy; + using policy_type = + Kokkos::TeamPolicy, execution_space_type>; using member_type = typename policy_type::member_type; // Compile-time expressions required for functor-level register allocations: @@ -335,8 +351,7 @@ class BatchedDblBufGemm { } KOKKOS_INLINE_FUNCTION - void operator()(const Kokkos::LayoutRight &, - const MemberType &member) const { + void operator()(LayoutRightTag, const MemberType &member) const { // TODO: use Kokkos view with compile-time size to allocating register?? // Then we can use local deep copy for prefetch_reg population. // Allocate registers used for prefetching @@ -503,8 +518,7 @@ class BatchedDblBufGemm { } KOKKOS_INLINE_FUNCTION - void operator()(const Kokkos::LayoutLeft &, - const MemberType &member) const { + void operator()(LayoutLeftTag, const MemberType &member) const { // TODO: use Kokkos view with compile-time size to allocating register?? // Then we can use local deep copy for prefetch_reg population. 
// Allocate registers used for prefetching diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp index f2b009fe2f..1548d602e2 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_Serial_Internal.hpp @@ -5,8 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" #include "KokkosBatched_InnerGemmFixC_Serial_Impl.hpp" @@ -41,9 +41,9 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - SerialSetInternal ::invoke(m, n, zero, C, cs0, cs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, C, cs0, cs1); else if (beta != one) - SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1); + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1); if (alpha != zero) { if (m <= 0 || n <= 0 || k <= 0) return 0; @@ -81,9 +81,9 @@ KOKKOS_INLINE_FUNCTION int SerialGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - SerialSetInternal ::invoke(m, n, zero, C, cs0, cs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, C, cs0, cs1); else if (beta != one) - SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1); + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, beta, C, cs0, cs1); if (alpha != zero) { if (m <= 0 || n <= 0 || k <= 0) return 0; diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp index b0c1f9c1ae..a516f765a1 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_TeamVector_Internal.hpp @@ -5,8 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" 
-#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" namespace KokkosBatched { @@ -39,9 +39,11 @@ TeamVectorGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - TeamVectorSetInternal ::invoke(member, m, n, zero, C, cs0, cs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0, + cs1); else if (beta != one) - TeamVectorScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, + cs0, cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; @@ -79,9 +81,11 @@ TeamVectorGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - TeamVectorSetInternal ::invoke(member, m, n, zero, C, cs0, cs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, C, cs0, + cs1); else if (beta != one) - TeamVectorScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, beta, C, + cs0, cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp index 73d831586b..4f147a98fc 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_Team_Internal.hpp @@ -6,8 +6,8 @@ #include "KokkosBatched_Util.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosBatched_InnerGemmFixC_Serial_Impl.hpp" @@ -41,9 +41,10 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - TeamSetInternal ::invoke(member, m, n, zero, C, cs0, cs1); + 
KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - TeamScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, + cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; @@ -82,9 +83,10 @@ KOKKOS_INLINE_FUNCTION int TeamGemmInternal::invoke( const ScalarType one(1.0), zero(0.0); if (beta == zero) - TeamSetInternal ::invoke(member, m, n, zero, C, cs0, cs1); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, C, cs0, cs1); else if (beta != one) - TeamScaleInternal::invoke(member, m, n, beta, C, cs0, cs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, beta, C, cs0, + cs1); if (alpha != ScalarType(0.0)) { if (m <= 0 || n <= 0 || k <= 0) return 0; diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp index fbd4a1e2d3..ef499b82fd 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Serial_Internal.hpp @@ -5,9 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" - +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" #include "KokkosBatched_InnerMultipleDotProduct_Serial_Impl.hpp" namespace KokkosBatched { @@ -39,9 +38,9 @@ KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( // y (m), A(m x n), B(n) if (beta == zero) - SerialSetInternal ::invoke(m, zero, y, ys0); + KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, y, ys0); else if (beta != one) - SerialScaleInternal::invoke(m, beta, y, ys0); + KokkosBlas::Impl::SerialScaleInternal::invoke(m, beta, y, ys0); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; @@ -78,9 +77,9 @@ KOKKOS_INLINE_FUNCTION int SerialGemvInternal::invoke( constexpr int mbAlgo = Algo::Gemv::Blocked::mb(); if (beta == 
zero) - SerialSetInternal ::invoke(m, zero, y, ys0); + KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, y, ys0); else if (beta != one) - SerialScaleInternal::invoke(m, beta, y, ys0); + KokkosBlas::Impl::SerialScaleInternal::invoke(m, beta, y, ys0); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp index 7e21019f94..0cad2c6c80 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp @@ -30,9 +30,17 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - return TeamVectorGemvInternal::invoke( - member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + if (AViewType::Rank == 2) + return TeamVectorGemvInternal::invoke( + member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + else + return TeamVectorGemvInternal::template invoke< + MemberType, ScalarType, typename AViewType::array_layout, + typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), + A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(), + x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; @@ -60,9 +68,17 @@ struct TeamVectorGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - return TeamVectorGemvInternal::invoke( - member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + if 
(AViewType::Rank == 2) + return TeamVectorGemvInternal::invoke( + member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + else + return TeamVectorGemvInternal::template invoke< + MemberType, ScalarType, typename AViewType::array_layout, + typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(), + x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp index f4054030a3..406115aa4f 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp @@ -5,9 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" - +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosBatched_InnerMultipleDotProduct_Serial_Impl.hpp" namespace KokkosBatched { @@ -28,6 +27,20 @@ struct TeamVectorGemvInternal { assert(false && "Error: encounter dummy impl"); return 0; } + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType & /*member*/, const int /*N*/, const int /*m*/, + const int /*n*/, const ScalarType /*alpha*/, + const ValueType *KOKKOS_RESTRICT /*A*/, const int /*as0*/, + const int /*as1*/, const int /*as2*/, + const ValueType *KOKKOS_RESTRICT /*x*/, const int /*xs0*/, + const int /*xs1*/, const ScalarType /*beta*/, + /**/ ValueType *KOKKOS_RESTRICT /*y*/, const int /*ys0*/, + const int /*ys1*/) { + assert(false && "Error: encounter dummy impl"); + return 0; + } }; template <> @@ -44,9 +57,9 @@ TeamVectorGemvInternal::invoke( // y (m), A(m x n), B(n) if (beta == zero) - TeamVectorSetInternal ::invoke(member, m, zero, 
y, ys0); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, y, ys0); else if (beta != one) - TeamVectorScaleInternal::invoke(member, m, beta, y, ys0); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, beta, y, ys0); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; @@ -69,6 +82,55 @@ TeamVectorGemvInternal::invoke( return 0; } +template <> +template +KOKKOS_INLINE_FUNCTION int +TeamVectorGemvInternal::invoke( + const MemberType &member, const int N, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X, + const int xs0, const int xs1, const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + const ScalarType one(1.0), zero(0.0); + + // y_l = beta y_l + alpha A_l x_l for l in range(0, N) + // y_l (m), A_l(m x n), B_l(n) + + if (beta == zero) + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), + [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] = zero; + }); + else if (beta != one) + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), + [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] *= beta; + }); + + if (alpha != zero) { + if (m <= 0 || n <= 0) return 0; + + if (beta != one) member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), + [&](const int &iTemp) { + int iRow, iMatrix; + ValueType t(0); + getIndices(iTemp, m, N, iRow, iMatrix); + for (int i = 0; i < n; ++i) + t += A[as0 * iMatrix + as1 * iRow + as2 * i] * + X[xs0 * iMatrix + xs1 * i]; + Y[ys0 * iMatrix + ys1 * iRow] += alpha * t; + }); + } + return 0; +} + } // namespace KokkosBatched #endif diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp index 
73ee2b9ad3..d32232524a 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp @@ -30,9 +30,17 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - return TeamGemvInternal::invoke( - member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + if (AViewType::Rank == 2) + return TeamGemvInternal::invoke( + member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + else + return TeamGemvInternal::template invoke< + MemberType, ScalarType, typename AViewType::array_layout, + typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), + A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(), + x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; @@ -60,9 +68,17 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - return TeamGemvInternal::invoke( - member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + if (AViewType::Rank == 2) + return TeamGemvInternal::invoke( + member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + else + return TeamGemvInternal::template invoke< + MemberType, ScalarType, typename AViewType::array_layout, + typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(), + x.stride_1(), beta, 
y.data(), y.stride_0(), y.stride_1()); } }; diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp index 98415cd034..cf611db5ca 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp @@ -5,9 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" - +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosBatched_InnerMultipleDotProduct_Serial_Impl.hpp" namespace KokkosBatched { @@ -24,6 +23,15 @@ struct TeamGemvInternal { const int as1, const ValueType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, /**/ ValueType *KOKKOS_RESTRICT y, const int ys0); + + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType &member, const int N, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const int as2, const ValueType *KOKKOS_RESTRICT x, + const int xs0, const int xs1, const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT y, const int ys0, const int ys1); }; template <> @@ -39,9 +47,9 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( // y (m), A(m x n), B(n) if (beta == zero) - TeamSetInternal ::invoke(member, m, zero, y, ys0); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, y, ys0); else if (beta != one) - TeamScaleInternal::invoke(member, m, beta, y, ys0); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, beta, y, ys0); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; @@ -78,9 +86,9 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( constexpr int mbAlgo = Algo::Gemv::Blocked::mb(); if (beta == zero) - TeamSetInternal ::invoke(member, m, zero, y, ys0); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, y, ys0); else if (beta != one) - 
TeamScaleInternal::invoke(member, m, beta, y, ys0); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, beta, y, ys0); if (alpha != zero) { if (m <= 0 || n <= 0) return 0; @@ -105,6 +113,54 @@ KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( return 0; } + +template <> +template +KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( + const MemberType &member, const int N, const int m, const int n, + const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, + const int as1, const int as2, const ValueType *KOKKOS_RESTRICT X, + const int xs0, const int xs1, const ScalarType beta, + /**/ ValueType *KOKKOS_RESTRICT Y, const int ys0, const int ys1) { + const ScalarType one(1.0), zero(0.0); + + // y_l = beta y_l + alpha A_l x_l for l in range(0, N) + // y_l (m), A_l(m x n), B_l(n) + + if (beta == zero) + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), + [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] = zero; + }); + else if (beta != one) + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), + [&](const int &iTemp) { + int iRow, iMatrix; + getIndices(iTemp, m, N, iRow, iMatrix); + Y[ys0 * iMatrix + ys1 * iRow] *= beta; + }); + + if (alpha != zero) { + if (m <= 0 || n <= 0) return 0; + + if (beta != one) member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, N * m), + [&](const int &iTemp) { + int iRow, iMatrix; + ValueType t(0); + getIndices(iTemp, m, N, iRow, iMatrix); + for (int i = 0; i < n; ++i) + t += A[as0 * iMatrix + as1 * iRow + as2 * i] * + X[xs0 * iMatrix + xs1 * i]; + Y[ys0 * iMatrix + ys1 * iRow] += alpha * t; + }); + } + return 0; +} } // namespace KokkosBatched #endif diff --git a/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp new file mode 100644 index 0000000000..a9e10a1ebd --- /dev/null +++ b/src/batched/dense/impl/KokkosBatched_Gesv_Impl.hpp @@ -0,0 
+1,790 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +#ifndef __KOKKOSBATCHED_GESV_IMPL_HPP__ +#define __KOKKOSBATCHED_GESV_IMPL_HPP__ + +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "KokkosBatched_Util.hpp" +#include +#include "KokkosBatched_Trsm_Decl.hpp" +#include "KokkosBatched_Copy_Decl.hpp" + +namespace KokkosBatched { + +struct SerialStaticPivoting { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, + const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, + const VectorType2 tmp_v_2); +}; + +template +struct TeamStaticPivoting { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); +}; + +template +struct TeamVectorStaticPivoting { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, const VectorType2 tmp_v_2); +}; + +template +KOKKOS_INLINE_FUNCTION int SerialStaticPivoting::invoke( + const MatrixType1 A, const MatrixType2 PDAD, const VectorType1 Y, + const VectorType2 PDY, const VectorType2 D2, const VectorType2 tmp_v_1, + const VectorType2 tmp_v_2) { + using value_type = typename MatrixType1::non_const_value_type; + const size_t n = A.extent(0); + + // First, the algorithm loops over the rows and columns and search + // for the maximal absolute value per row and column. 
+ for (size_t i = 0; i < n; ++i) { + D2(i) = Kokkos::ArithTraits::zero(); + tmp_v_1(i) = 0; + tmp_v_2(i) = 1.; + for (size_t j = 0; j < n; ++j) { + if (D2(i) < Kokkos::abs(A(j, i))) D2(i) = Kokkos::abs(A(j, i)); + if (tmp_v_1(i) < Kokkos::abs(A(i, j))) tmp_v_1(i) = Kokkos::abs(A(i, j)); + } + D2(i) = 1. / D2(i); + } + + // Then, the inverse of the maximal value per column is used to scale + // A by the right. + for (size_t i = 0; i < n; ++i) { + for (size_t j = 0; j < n; ++j) { + A(i, j) *= D2(j); + } + } + + // Once again, the algorithm loops over the rows and store the maximal + // absolute value per row but after the right scalling and do a left scalling + // of A and Y. + value_type D1_i; + for (size_t i = 0; i < n; ++i) { + D1_i = Kokkos::ArithTraits::zero(); + for (size_t j = 0; j < n; ++j) { + if (D1_i < Kokkos::abs(A(i, j))) D1_i = Kokkos::abs(A(i, j)); + } + D1_i = 1. / D1_i; + for (size_t j = 0; j < n; ++j) { + A(i, j) *= D1_i; + } + Y(i) *= D1_i; + } + + // Finally, the algorithm starts to loop over the rows in an order such that + // their initial maximal absolute value decrease (it uses the tmp_v_1 to do + // so), then for a given row, it finds the available column with the largest + // absolute value. If this value is zero, the algorithm failed to compute a + // good pivot, otherwise it puts the current row to the found column index and + // it labels the row and column index as unavailable and continue the loop + // over the rows. 
+ // + for (size_t i = 0; i < n; ++i) { + int row_index = 0; + int col_index = 0; + value_type tmp_0 = Kokkos::ArithTraits::zero(); + value_type tmp_1 = Kokkos::ArithTraits::zero(); + for (size_t j = 0; j < n; ++j) { + if (tmp_0 < tmp_v_1(j)) { + tmp_0 = tmp_v_1(j); + row_index = j; + } + } + for (size_t j = 0; j < n; ++j) { + if (tmp_1 < Kokkos::abs(A(row_index, j) * tmp_v_2(j))) { + tmp_1 = Kokkos::abs(A(row_index, j) * tmp_v_2(j)); + col_index = j; + } + } + if (tmp_1 == Kokkos::ArithTraits::zero()) return 1; + tmp_v_1(row_index) = Kokkos::ArithTraits::zero(); + tmp_v_2(col_index) = Kokkos::ArithTraits::zero(); + + for (size_t j = 0; j < n; ++j) { + PDAD(col_index, j) = A(row_index, j); + } + PDY(col_index) = Y(row_index); + } + + return 0; +} + +template +template +KOKKOS_INLINE_FUNCTION int TeamStaticPivoting::invoke( + const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, const VectorType2 tmp_v_2) { + using value_type = typename MatrixType1::non_const_value_type; + using reducer_value_type = + typename Kokkos::MaxLoc::value_type; + // This implementation follows the strategy of SerialStaticPivoting but uses + // an extra level of parallelism. + + // Made this non-const in order to WORKAROUND issue #349 (Credit to C. Trott) + size_t n = A.extent(0); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { + D2(i) = Kokkos::ArithTraits::zero(); + tmp_v_1(i) = 0; + tmp_v_2(i) = 1.; + for (size_t j = 0; j < n; ++j) { + if (D2(i) < Kokkos::abs(A(j, i))) D2(i) = Kokkos::abs(A(j, i)); + if (tmp_v_1(i) < Kokkos::abs(A(i, j))) tmp_v_1(i) = Kokkos::abs(A(i, j)); + } + D2(i) = 1. 
/ D2(i); + }); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { + for (size_t j = 0; j < n; ++j) { + A(i, j) *= D2(j); + } + }); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { + value_type D1_i = Kokkos::ArithTraits::zero(); + for (size_t j = 0; j < n; ++j) { + if (D1_i < Kokkos::abs(A(i, j))) D1_i = Kokkos::abs(A(i, j)); + } + D1_i = 1. / D1_i; + for (size_t j = 0; j < n; ++j) { + A(i, j) *= D1_i; + } + Y(i) *= D1_i; + }); + + for (size_t i = 0; i < n; ++i) { + int row_index, col_index; + reducer_value_type value; + Kokkos::MaxLoc reducer_value(value); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (tmp_v_1(j) > update.val) { + update.val = tmp_v_1(j); + update.loc = j; + } + }, + reducer_value); + row_index = value.loc; + value.loc = 0; + value.val = Kokkos::ArithTraits::zero(); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (Kokkos::abs(A(row_index, j) * tmp_v_2(j)) > update.val) { + update.val = Kokkos::abs(A(row_index, j) * tmp_v_2(j)); + update.loc = j; + } + }, + reducer_value); + col_index = value.loc; + if (value.val == Kokkos::ArithTraits::zero()) return 1; + tmp_v_1(row_index) = Kokkos::ArithTraits::zero(); + tmp_v_2(col_index) = Kokkos::ArithTraits::zero(); + + for (size_t j = 0; j < n; ++j) { + PDAD(col_index, j) = A(row_index, j); + } + PDY(col_index) = Y(row_index); + } + return 0; +} + +template +template +KOKKOS_INLINE_FUNCTION int TeamVectorStaticPivoting::invoke( + const MemberType &member, const MatrixType1 A, const MatrixType2 PDAD, + const VectorType1 Y, const VectorType2 PDY, const VectorType2 D2, + const VectorType2 tmp_v_1, const VectorType2 tmp_v_2) { + using value_type = typename MatrixType1::non_const_value_type; + using reducer_value_type = + typename Kokkos::MaxLoc::value_type; + // This implementation follows the strategy of 
SerialStaticPivoting but uses + // two extra levels of parallelism. + + const size_t n = A.extent(0); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { + D2(i) = Kokkos::ArithTraits::zero(); + tmp_v_1(i) = 0; + tmp_v_2(i) = 1.; + reducer_value_type value; + Kokkos::MaxLoc reducer_value(value); + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (Kokkos::abs(A(j, i)) > update.val) { + update.val = Kokkos::abs(A(j, i)); + update.loc = j; + } + }, + reducer_value); + D2(i) = 1. / value.val; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (Kokkos::abs(A(i, j)) > update.val) { + update.val = Kokkos::abs(A(i, j)); + update.loc = j; + } + }, + reducer_value); + tmp_v_1(i) = value.val; + }); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), + [&](const int &j) { A(i, j) *= D2(j); }); + }); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), [&](const int &i) { + value_type D1_i = Kokkos::ArithTraits::zero(); + reducer_value_type value; + Kokkos::MaxLoc reducer_value(value); + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (Kokkos::abs(A(i, j)) > update.val) { + update.val = Kokkos::abs(A(i, j)); + update.loc = j; + } + }, + reducer_value); + D1_i = 1. 
/ value.val; + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, n), + [&](const int &j) { A(i, j) *= D1_i; }); + Y(i) *= D1_i; + }); + + for (size_t i = 0; i < n; ++i) { + int row_index, col_index; + reducer_value_type value; + Kokkos::MaxLoc reducer_value(value); + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (tmp_v_1(j) > update.val) { + update.val = tmp_v_1(j); + update.loc = j; + } + }, + reducer_value); + row_index = value.loc; + value.loc = 0; + value.val = Kokkos::ArithTraits::zero(); + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(member, n), + [&](const int &j, reducer_value_type &update) { + if (Kokkos::abs(A(row_index, j) * tmp_v_2(j)) > update.val) { + update.val = Kokkos::abs(A(row_index, j) * tmp_v_2(j)); + update.loc = j; + } + }, + reducer_value); + col_index = value.loc; + if (value.val == Kokkos::ArithTraits::zero()) return 1; + tmp_v_1(row_index) = Kokkos::ArithTraits::zero(); + tmp_v_2(col_index) = Kokkos::ArithTraits::zero(); + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, n), [&](const int &j) { + PDAD(col_index, j) = A(row_index, j); + }); + PDY(col_index) = Y(row_index); + } + return 0; +} + +template +KOKKOS_INLINE_FUNCTION void SerialHadamard1D(const VectorType1 X, + const VectorType2 D, + const VectorType3 DX) { + const size_t n = X.extent(0); + + for (size_t i = 0; i < n; ++i) { + DX(i) = D(i) * X(i); + } +} + +template +KOKKOS_INLINE_FUNCTION void TeamHadamard1D(const MemberType &member, + const VectorType1 X, + const VectorType2 D, + const VectorType3 DX) { + const size_t n = X.extent(0); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, n), + [&](const size_t &i) { DX(i) = D(i) * X(i); }); +} + +template +KOKKOS_INLINE_FUNCTION void TeamVectorHadamard1D(const MemberType &member, + const VectorType1 X, + const VectorType2 D, + const VectorType3 DX) { + const size_t n = X.extent(0); + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 
n), + [&](const size_t &i) { DX(i) = D(i) * X(i); }); +} + +/// +/// Serial Impl +/// =========== +template <> +struct SerialGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, + const VectorType X, + const VectorType Y, + const MatrixType tmp) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + + if (A.extent(0) != tmp.extent(0) || A.extent(1) + 4 != tmp.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and tmp do not match: A: " + "%d x %d, tmp (note: its second dimension should be the second " + "dimension of A + 4): %d x %d\n", + (int)A.extent(0), (int)A.extent(1), (int)tmp.extent(0), + (int)tmp.extent(1)); + return 1; + } + + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); + return 1; + } +#endif + + const int n = A.extent(0); + + auto PDAD = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n)); + auto PDY = Kokkos::subview(tmp, Kokkos::ALL, n); + auto D2 = Kokkos::subview(tmp, Kokkos::ALL, n + 1); + auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); + auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); + + if (SerialStaticPivoting::invoke(A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == + 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: the currently implemented static pivoting " + 
"failed.\n"); + return 1; + } + + int r_val = SerialLU::invoke(PDAD); + + if (r_val == 0) + r_val = + SerialTrsm::invoke(1.0, PDAD, PDY); + + if (r_val == 0) + r_val = + SerialTrsm::invoke(1.0, PDAD, PDY); + + if (r_val == 0) SerialHadamard1D(PDY, D2, X); + return r_val; + } +}; + +template <> +struct SerialGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MatrixType A, + const VectorType X, + const VectorType Y, + const MatrixType /*tmp*/) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); + return 1; + } +#endif + + int r_val = SerialLU::invoke(A); + + if (r_val == 0) r_val = SerialCopy::invoke(Y, X); + + if (r_val == 0) + r_val = + SerialTrsm::invoke(1.0, A, X); + + if (r_val == 0) + r_val = + SerialTrsm::invoke(1.0, A, X); + + return r_val; + } +}; + +/// +/// Team Impl +/// ========= + +template +struct TeamGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const MatrixType A, + const VectorType X, + const VectorType Y) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + 
static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); + return 1; + } +#endif + using ScratchPadMatrixViewType = Kokkos::View< + typename MatrixType::non_const_value_type **, + typename MatrixType::execution_space::scratch_memory_space>; + + const int n = A.extent(0); + + ScratchPadMatrixViewType tmp(member.team_scratch(0), n, n + 4); + auto PDAD = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n)); + auto PDY = Kokkos::subview(tmp, Kokkos::ALL, n); + auto D2 = Kokkos::subview(tmp, Kokkos::ALL, n + 1); + auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); + auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); + + if (TeamStaticPivoting::invoke(member, A, PDAD, Y, PDY, D2, + tmp_v_1, tmp_v_2) == 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: the currently implemented static pivoting " + "failed.\n"); + return 1; + } + member.team_barrier(); + + int r_val = + TeamLU::invoke(member, PDAD); + member.team_barrier(); + + if (r_val == 0) { + r_val = TeamTrsm::invoke(member, 1.0, + PDAD, PDY); + member.team_barrier(); + } + + if (r_val == 0) { + r_val = + TeamTrsm::invoke(member, 1.0, + PDAD, PDY); + member.team_barrier(); + } + + if (r_val == 0) { + TeamHadamard1D(member, PDY, D2, X); + member.team_barrier(); + } + + return r_val; + } +}; + +template +struct TeamGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const MatrixType A, + const VectorType X, + const VectorType Y) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + 
static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); + return 1; + } +#endif + + int r_val = TeamLU::invoke(member, A); + member.team_barrier(); + + if (r_val == 0) { + TeamCopy::invoke(member, Y, X); + member.team_barrier(); + } + + if (r_val == 0) { + TeamTrsm::invoke(member, 1.0, A, X); + member.team_barrier(); + } + + if (r_val == 0) { + TeamTrsm::invoke(member, 1.0, A, + X); + member.team_barrier(); + } + + return r_val; + } +}; + +/// +/// TeamVector Impl +/// ========= + +template +struct TeamVectorGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const MatrixType A, + const VectorType X, + const VectorType Y) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. 
+ if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); + return 1; + } +#endif + using ScratchPadMatrixViewType = Kokkos::View< + typename MatrixType::non_const_value_type **, + typename MatrixType::execution_space::scratch_memory_space>; + + const int n = A.extent(0); + + ScratchPadMatrixViewType tmp(member.team_scratch(0), n, n + 4); + auto PDAD = Kokkos::subview(tmp, Kokkos::ALL, Kokkos::make_pair(0, n)); + auto PDY = Kokkos::subview(tmp, Kokkos::ALL, n); + auto D2 = Kokkos::subview(tmp, Kokkos::ALL, n + 1); + auto tmp_v_1 = Kokkos::subview(tmp, Kokkos::ALL, n + 2); + auto tmp_v_2 = Kokkos::subview(tmp, Kokkos::ALL, n + 3); + + if (TeamVectorStaticPivoting::invoke( + member, A, PDAD, Y, PDY, D2, tmp_v_1, tmp_v_2) == 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: the currently implemented static pivoting " + "failed.\n"); + return 1; + } + + member.team_barrier(); + + int r_val = + TeamLU::invoke(member, PDAD); + member.team_barrier(); + + if (r_val == 0) { + TeamVectorTrsm::invoke(member, 1.0, + PDAD, PDY); + member.team_barrier(); + } + + if (r_val == 0) { + TeamVectorTrsm::invoke(member, + 1.0, PDAD, + PDY); + member.team_barrier(); + } + + if (r_val == 0) { + TeamVectorHadamard1D(member, PDY, D2, X); + member.team_barrier(); + } + + return r_val; + } +}; + +template +struct TeamVectorGesv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const MatrixType A, + const VectorType X, + const VectorType Y) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: MatrixType is not a Kokkos::View."); + static_assert(Kokkos::is_view::value, + "KokkosBatched::gesv: VectorType is not a Kokkos::View."); + 
static_assert(MatrixType::Rank == 2, + "KokkosBatched::gesv: MatrixType must have rank 2."); + static_assert(VectorType::Rank == 1, + "KokkosBatched::gesv: VectorType must have rank 1."); + + // Check compatibility of dimensions at run time. + if (A.extent(0) != X.extent(0) || A.extent(1) != X.extent(0) || + A.extent(0) != Y.extent(0)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::gesv: dimensions of A and X and Y do not match: A: " + "%d x %d, X: %d, Y: %d\n", + (int)A.extent(0), (int)A.extent(1), (int)X.extent(0), + (int)Y.extent(0)); + return 1; + } +#endif + + int r_val = TeamLU::invoke(member, A); + member.team_barrier(); + + if (r_val == 0) { + TeamVectorCopy::invoke(member, Y, X); + member.team_barrier(); + } + + if (r_val == 0) { + TeamVectorTrsm::invoke(member, 1.0, + A, X); + member.team_barrier(); + } + + if (r_val == 0) { + TeamVectorTrsm::invoke(member, + 1.0, A, X); + member.team_barrier(); + } + + return r_val; + } +}; + +} // namespace KokkosBatched + +#endif diff --git a/src/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp index 58cd9bad2d..4c0f39097f 100644 --- a/src/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_HessenbergFormQ_Serial_Internal.hpp @@ -4,7 +4,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBatched_SetIdentity_Internal.hpp" #include "KokkosBatched_ApplyQ_Serial_Internal.hpp" @@ -37,7 +37,8 @@ struct SerialHessenbergFormQInternal { /// B is m x m // set identity if (is_Q_zero) - SerialSetInternal::invoke(m, value_type(1), Q, qs0 + qs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, value_type(1), Q, + qs0 + qs1); else SerialSetIdentityInternal::invoke(m, Q, qs0, qs1); diff --git 
a/src/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp index 46feefb91b..23171c063e 100644 --- a/src/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_QR_FormQ_Serial_Internal.hpp @@ -4,7 +4,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBatched_SetIdentity_Internal.hpp" #include "KokkosBatched_ApplyQ_Serial_Internal.hpp" diff --git a/src/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp index 52178a095a..13a4ef4636 100644 --- a/src/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_QR_FormQ_TeamVector_Internal.hpp @@ -4,7 +4,7 @@ /// \author Kyungjoo Kim (kyukim@sandia.gov) #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBatched_SetIdentity_Internal.hpp" #include "KokkosBatched_ApplyQ_TeamVector_Internal.hpp" @@ -36,7 +36,8 @@ struct TeamVectorQR_FormQ_Internal { // set identity if (is_Q_zero) - TeamVectorSetInternal::invoke(member, m, value_type(1), Q, qs0 + qs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, value_type(1), + Q, qs0 + qs1); else TeamVectorSetIdentityInternal::invoke(member, m, n, Q, qs0, qs1); member.team_barrier(); diff --git a/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp index 0c7007bdf3..446ba50c03 100644 --- a/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_SVD_Serial_Internal.hpp @@ -37,10 +37,15 @@ struct SerialSVDInternal { KOKKOS_INLINE_FUNCTION static void symEigen2x2(value_type a11, value_type a21, 
value_type a22, value_type& e1, value_type& e2) { - value_type a = Kokkos::ArithTraits::one(); - value_type b = -a11 - a22; - value_type c = a11 * a22 - a21 * a21; - value_type sqrtDet = Kokkos::Experimental::sqrt(b * b - 4 * a * c); + value_type a = Kokkos::ArithTraits::one(); + value_type b = -a11 - a22; + value_type c = a11 * a22 - a21 * a21; +#if KOKKOS_VERSION >= 30699 + using Kokkos::sqrt; +#else + using Kokkos::Experimental::sqrt; +#endif + value_type sqrtDet = sqrt(b * b - 4 * a * c); e1 = (-b + sqrtDet) / (2 * a); e2 = (-b - sqrtDet) / (2 * a); } diff --git a/src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp deleted file mode 100644 index b4e865ddea..0000000000 --- a/src/batched/dense/impl/KokkosBatched_Scale_Impl.hpp +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef __KOKKOSBATCHED_SCALE_IMPL_HPP__ -#define __KOKKOSBATCHED_SCALE_IMPL_HPP__ - -/// \author Kyungjoo Kim (kyukim@sandia.gov) - -#include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Scale_Internal.hpp" - -namespace KokkosBatched { - -/// -/// Serial Impl -/// =========== -template -KOKKOS_INLINE_FUNCTION int SerialScale::invoke(const ScalarType alpha, - const AViewType &A) { - return SerialScaleInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1()); -} - -/// -/// Team Impl -/// ========= - -template -template -KOKKOS_INLINE_FUNCTION int TeamScale::invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A) { - return TeamScaleInternal::invoke(member, A.extent(0), A.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1()); -} - -/// -/// TeamVector Impl -/// =============== - -template -template -KOKKOS_INLINE_FUNCTION int TeamVectorScale::invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A) { - return TeamVectorScaleInternal::invoke(member, A.extent(0), A.extent(1), - alpha, A.data(), A.stride_0(), - A.stride_1()); -} - -} // namespace KokkosBatched - -#endif 
diff --git a/src/batched/dense/impl/KokkosBatched_Set_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Set_Impl.hpp deleted file mode 100644 index 148e051ce4..0000000000 --- a/src/batched/dense/impl/KokkosBatched_Set_Impl.hpp +++ /dev/null @@ -1,48 +0,0 @@ -#ifndef __KOKKOSBATCHED_SET_IMPL_HPP__ -#define __KOKKOSBATCHED_SET_IMPL_HPP__ - -/// \author Kyungjoo Kim (kyukim@sandia.gov) - -#include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" - -namespace KokkosBatched { - -/// -/// Serial Impl -/// =========== - -template -KOKKOS_INLINE_FUNCTION int SerialSet::invoke(const ScalarType alpha, - const AViewType &A) { - return SerialSetInternal::invoke(A.extent(0), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_1()); -} - -/// -/// Team Impl -/// ========= - -template -template -KOKKOS_INLINE_FUNCTION int TeamSet::invoke(const MemberType &member, - const ScalarType alpha, - const AViewType &A) { - return TeamSetInternal::invoke(member, A.extent(0), A.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1()); -} - -/// -/// TeamVector Impl -/// =============== - -template -template -KOKKOS_INLINE_FUNCTION int TeamVectorSet::invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A) { - return TeamVectorSetInternal::invoke(member, A.extent(0), A.extent(1), alpha, - A.data(), A.stride_0(), A.stride_1()); -} -} // end namespace KokkosBatched - -#endif diff --git a/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp index b0e2ea5b80..c6aec99d18 100644 --- a/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_ShiftedTrsv_Serial_Internal.hpp @@ -5,9 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" - +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include 
"KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemv_Serial_Internal.hpp" diff --git a/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp index 9b5cc055e3..ac53992064 100644 --- a/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trmm_Serial_Internal.hpp @@ -47,8 +47,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" namespace KokkosBatched { @@ -152,9 +152,10 @@ SerialTrmmInternalLeftLower::invoke( if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0; if (alpha == zero) - SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -240,9 +241,10 @@ SerialTrmmInternalRightLower::invoke( if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0; if (alpha == zero) - SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -321,9 +323,10 @@ SerialTrmmInternalLeftUpper::invoke( if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0; if (alpha == zero) - SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); 
+ if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -401,9 +404,10 @@ SerialTrmmInternalRightUpper::invoke( if (bm <= 0 || bn <= 0 || am <= 0 || an <= 0) return 0; if (alpha == zero) - SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(bm, bn, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(bm, bn, alpha, B, bs0, bs1); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp index b317bed4f7..b29b54931f 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsm_Serial_Internal.hpp @@ -5,9 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" - +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" #include "KokkosBatched_InnerGemmFixA_Serial_Impl.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" @@ -39,9 +38,10 @@ SerialTrsmInternalLeftLower::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -87,9 +87,10 @@ SerialTrsmInternalLeftLower::invoke( const ScalarType one(1.0), zero(0.0), minus_one(-1.0); if (alpha == zero) - SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); + 
KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; InnerTrsmLeftLowerUnitDiag trsm_u(as0, as1, bs0, bs1); @@ -154,9 +155,10 @@ SerialTrsmInternalLeftUpper::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType *KOKKOS_RESTRICT B0 = B; @@ -202,9 +204,10 @@ SerialTrsmInternalLeftUpper::invoke( constexpr int mbAlgo = Algo::Trsm::Blocked::mb(); if (alpha == zero) - SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { - if (alpha != one) SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, bs1); diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp index 0afa92ae6e..08819e8c18 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsm_TeamVector_Internal.hpp @@ -5,8 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" namespace KokkosBatched { @@ -35,10 +35,12 @@ TeamVectorTrsmInternalLeftLower::invoke( const ScalarType one(1.0), 
zero(0.0); if (alpha == zero) - TeamVectorSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0, + bs1); else { if (alpha != one) - TeamVectorScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, + bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -96,10 +98,12 @@ TeamVectorTrsmInternalLeftUpper::invoke( // note that parallel range is different ( m*n vs m-1*n); if (alpha == zero) - TeamVectorSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, n, zero, B, bs0, + bs1); else { if (alpha != one) - TeamVectorScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, n, alpha, B, + bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType *KOKKOS_RESTRICT B0 = B; diff --git a/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp index 37e5051675..f9e2bed8f8 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsm_Team_Internal.hpp @@ -6,9 +6,8 @@ #include "KokkosBatched_Util.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" - +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemm_Team_Internal.hpp" @@ -39,10 +38,11 @@ TeamTrsmInternalLeftLower::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) - TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); + 
KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, + bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -90,10 +90,11 @@ TeamTrsmInternalLeftLower::invoke( // note that parallel range is different ( m*n vs m-1*n); if (alpha == zero) - TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) - TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, + bs1); if (m <= 0 || n <= 0) return 0; /// @@ -173,10 +174,11 @@ TeamTrsmInternalLeftUpper::invoke( // note that parallel range is different ( m*n vs m-1*n); if (alpha == zero) - TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) - TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, + bs1); if (m <= 0 || n <= 0) return 0; ValueType *KOKKOS_RESTRICT B0 = B; @@ -229,10 +231,11 @@ TeamTrsmInternalLeftUpper::invoke( // note that parallel range is different ( m*n vs m-1*n); if (alpha == zero) - TeamSetInternal ::invoke(member, m, n, zero, B, bs0, bs1); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, n, zero, B, bs0, bs1); else { if (alpha != one) - TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, n, alpha, B, bs0, + bs1); if (m <= 0 || n <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, bs1); diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp index fb28ea5a9c..926003083a 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsv_Serial_Internal.hpp @@ -5,9 +5,8 @@ #include 
"KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" - +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_serial_scal_impl.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemv_Serial_Internal.hpp" @@ -42,9 +41,10 @@ SerialTrsvInternalLower::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - SerialSetInternal::invoke(m, zero, b, bs0); + KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -79,9 +79,10 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalLower::invoke( constexpr int mbAlgo = Algo::Trsv::Blocked::mb(); if (alpha == zero) - SerialSetInternal::invoke(m, zero, b, bs0); + KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; /// case GPU: team size is large and blocksize (mb,nb) is small @@ -135,9 +136,10 @@ SerialTrsvInternalUpper::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - SerialSetInternal::invoke(m, zero, b, bs0); + KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) SerialScaleInternal::invoke(m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; ValueType *KOKKOS_RESTRICT b0 = b; @@ -170,9 +172,10 @@ KOKKOS_INLINE_FUNCTION int SerialTrsvInternalUpper::invoke( // note that parallel range is different ( m*n vs m-1*n); if (alpha == zero) - SerialSetInternal::invoke(m, zero, b, bs0); + KokkosBlas::Impl::SerialSetInternal::invoke(m, zero, b, bs0); else { - if (alpha != one) SerialScaleInternal::invoke(m, 
alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::SerialScaleInternal::invoke(m, alpha, b, bs0); if (m <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, 0); diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp index ad50e6fc2a..b0da8f1f2d 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsv_TeamVector_Internal.hpp @@ -5,8 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" namespace KokkosBatched { @@ -43,9 +43,11 @@ TeamVectorTrsvInternalLower::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - TeamVectorSetInternal::invoke(member, m, zero, b, bs0); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) TeamVectorScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, + bs0); if (m <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -105,9 +107,11 @@ TeamVectorTrsvInternalUpper::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - TeamVectorSetInternal::invoke(member, m, zero, b, bs0); + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) TeamVectorScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, alpha, b, + bs0); if (m <= 0) return 0; ValueType *KOKKOS_RESTRICT b0 = b; diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp index 60b941e1ba..aaf72e9876 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp +++ 
b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp @@ -5,9 +5,8 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" - +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" #include "KokkosBatched_Gemv_Team_Internal.hpp" @@ -45,9 +44,10 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - TeamSetInternal::invoke(member, m, zero, b, bs0); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -90,9 +90,10 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( constexpr int mbAlgo = Algo::Trsv::Blocked::mb(); if (alpha == zero) - TeamSetInternal::invoke(member, m, zero, b, bs0); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; /// case GPU: team size is large and blocksize (mb,nb) is small @@ -154,9 +155,10 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( const ScalarType one(1.0), zero(0.0); if (alpha == zero) - TeamSetInternal::invoke(member, m, zero, b, bs0); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; ValueType *KOKKOS_RESTRICT b0 = b; @@ -197,9 +199,10 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( // note that parallel range is different ( 
m*n vs m-1*n); if (alpha == zero) - TeamSetInternal::invoke(member, m, zero, b, bs0); + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, b, bs0); else { - if (alpha != one) TeamScaleInternal::invoke(member, m, alpha, b, bs0); + if (alpha != one) + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, alpha, b, bs0); if (m <= 0) return 0; InnerTrsmLeftUpperUnitDiag trsm_u(as0, as1, bs0, 0); diff --git a/src/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp index ee14040aed..8c8af6cbd5 100644 --- a/src/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trtri_Serial_Internal.hpp @@ -108,8 +108,8 @@ SerialTrtriInternalLower::invoke( // SCAL -- x=ax // A((j+1):n,j) = A_ii * A((j+1):n,j) - SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, A_ii, A_col_vec, - as0, as1); + KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, + A_ii, A_col_vec, as0, as1); } } return 0; @@ -157,8 +157,8 @@ SerialTrtriInternalUpper::invoke( // SCAL -- x=ax // A((j+1):n,j) = A_ii * A((j+1):n,j) - SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, A_ii, A_col_vec, - as0, as1); + KokkosBlas::Impl::SerialScaleInternal::invoke(A_col_vec_m, A_col_vec_n, + A_ii, A_col_vec, as0, as1); } } return 0; diff --git a/src/batched/sparse/KokkosBatched_CG.hpp b/src/batched/sparse/KokkosBatched_CG.hpp index e1e6b5d6a4..7fa1f7e04b 100644 --- a/src/batched/sparse/KokkosBatched_CG.hpp +++ b/src/batched/sparse/KokkosBatched_CG.hpp @@ -68,12 +68,13 @@ namespace KokkosBatched { template struct CG { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const OperatorType &A, const VectorViewType &B, - const VectorViewType &X, - const KrylovHandle - &handle) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const OperatorType &A, + const VectorViewType &B, + const VectorViewType &X, + const 
KrylovHandleType &handle) { int status = 0; if (std::is_same::value) { status = diff --git a/src/batched/sparse/KokkosBatched_CrsMatrix.hpp b/src/batched/sparse/KokkosBatched_CrsMatrix.hpp index 5448c4684c..d7fd94744f 100644 --- a/src/batched/sparse/KokkosBatched_CrsMatrix.hpp +++ b/src/batched/sparse/KokkosBatched_CrsMatrix.hpp @@ -104,89 +104,37 @@ class CrsMatrix { /// \param beta [in]: input coefficient for Y (default value 0.) /// \param Y [in/out]: Output vector Y, a rank 2 view - template + template KOKKOS_INLINE_FUNCTION void apply( const MemberType &member, const XViewType &X, const YViewType &Y, MagnitudeType alpha = Kokkos::Details::ArithTraits::one(), MagnitudeType beta = Kokkos::Details::ArithTraits::zero()) const { - if (beta == 0) - KokkosBatched::Spmv::template invoke< + if (beta == Kokkos::Details::ArithTraits::zero()) + KokkosBatched::TeamVectorSpmv::template invoke< ValuesViewType, IntViewType, XViewType, YViewType, 0>( member, alpha, values, row_ptr, colIndices, X, beta, Y); else - KokkosBatched::Spmv::template invoke< + KokkosBatched::TeamVectorSpmv::template invoke< ValuesViewType, IntViewType, XViewType, YViewType, 1>( member, alpha, values, row_ptr, colIndices, X, beta, Y); } - /// \brief apply version that uses variable coefficient alpha and no beta - /// y_l <- alpha_l * A_l * x_l for all l = 1, ..., N - /// where: - /// * N is the number of matrices, - /// * A_1, ..., A_N are N sparse matrices which share the same sparsity - /// pattern, - /// * x_1, ..., x_N are the N input vectors, - /// * y_1, ..., y_N are the N output vectors, - /// * alpha_1, ..., alpha_N are N scaling factors for x_1, ..., x_N. 
- /// - /// \tparam MemberType: Input type for the TeamPolicy member - /// \tparam XViewType: Input type for X, needs to be a 2D view - /// \tparam YViewType: Input type for Y, needs to be a 2D view - /// \tparam ArgTrans: Argument for transpose or notranspose - /// \tparam ArgMode: Argument for the parallelism used in the apply - /// - /// \param member [in]: TeamPolicy member - /// \param alpha [in]: input coefficient for X, a rank 1 view - /// \param X [in]: Input vector X, a rank 2 view - /// \param Y [out]: Output vector Y, a rank 2 view - - template - KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, - const XViewType &X, const YViewType &Y, - NormViewType alpha) const { - KokkosBatched::Spmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, NormViewType, - NormViewType, 0>(member, alpha, values, row_ptr, colIndices, X, alpha, - Y); - } - - /// \brief apply version that uses variable coefficients alpha and beta - /// y_l <- alpha_l * A_l * x_l + beta_l * y_l for all l = 1, ..., N - /// where: - /// * N is the number of matrices, - /// * A_1, ..., A_N are N sparse matrices which share the same sparsity - /// pattern, - /// * x_1, ..., x_N are the N input vectors, - /// * y_1, ..., y_N are the N output vectors, - /// * alpha_1, ..., alpha_N are N scaling factors for x_1, ..., x_N, - /// * beta_1, ..., beta_N are N scaling factors for y_1, ..., y_N. 
- /// - /// \tparam MemberType: Input type for the TeamPolicy member - /// \tparam XViewType: Input type for X, needs to be a 2D view - /// \tparam YViewType: Input type for Y, needs to be a 2D view - /// \tparam NormViewType: Input type for alpha and beta, needs to be a 1D view - /// \tparam ArgTrans: Argument for transpose or notranspose - /// \tparam ArgMode: Argument for the parallelism used in the apply - /// - /// \param member [in]: TeamPolicy member - /// \param alpha [in]: input coefficient for X, a rank 1 view - /// \param X [in]: Input vector X, a rank 2 view - /// \param beta [in]: input coefficient for Y, a rank 1 view - /// \param Y [in/out]: Output vector Y, a rank 2 view - - template - KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, - const XViewType &X, const YViewType &Y, - const NormViewType &alpha, - const NormViewType &beta) const { - KokkosBatched::Spmv::template invoke< - ValuesViewType, IntViewType, XViewType, YViewType, NormViewType, - NormViewType, 1>(member, alpha, values, row_ptr, colIndices, X, beta, - Y); + template + KOKKOS_INLINE_FUNCTION void apply( + const XViewType &X, const YViewType &Y, + MagnitudeType alpha = Kokkos::Details::ArithTraits::one(), + MagnitudeType beta = + Kokkos::Details::ArithTraits::zero()) const { + if (beta == Kokkos::Details::ArithTraits::zero()) + KokkosBatched::SerialSpmv::template invoke< + ValuesViewType, IntViewType, XViewType, YViewType, 0>( + alpha, values, row_ptr, colIndices, X, beta, Y); + else + KokkosBatched::SerialSpmv::template invoke< + ValuesViewType, IntViewType, XViewType, YViewType, 1>( + alpha, values, row_ptr, colIndices, X, beta, Y); } }; diff --git a/src/batched/sparse/KokkosBatched_GMRES.hpp b/src/batched/sparse/KokkosBatched_GMRES.hpp index 512970006b..51efc24aed 100644 --- a/src/batched/sparse/KokkosBatched_GMRES.hpp +++ b/src/batched/sparse/KokkosBatched_GMRES.hpp @@ -60,7 +60,9 @@ /// \param handle [in]: a handle which provides different information such as /// the 
tolerance or the maximal number of iterations of the solver. +#include #include "KokkosBatched_Krylov_Handle.hpp" +#include "KokkosBatched_GMRES_Serial_Impl.hpp" #include "KokkosBatched_GMRES_Team_Impl.hpp" #include "KokkosBatched_GMRES_TeamVector_Impl.hpp" @@ -68,14 +70,18 @@ namespace KokkosBatched { template struct GMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const OperatorType &A, const VectorViewType &B, - const VectorViewType &X, - const KrylovHandle - &handle) { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const OperatorType &A, + const VectorViewType &B, + const VectorViewType &X, + const KrylovHandleType &handle) { int status = 0; - if (std::is_same::value) { + if (std::is_same::value) { + status = SerialGMRES::template invoke( + A, B, X, handle); + } else if (std::is_same::value) { status = TeamGMRES::template invoke( member, A, B, X, handle); diff --git a/src/batched/sparse/KokkosBatched_Identity.hpp b/src/batched/sparse/KokkosBatched_Identity.hpp index 57934df66a..6613bdd1ec 100644 --- a/src/batched/sparse/KokkosBatched_Identity.hpp +++ b/src/batched/sparse/KokkosBatched_Identity.hpp @@ -60,8 +60,8 @@ class Identity { KOKKOS_INLINE_FUNCTION ~Identity() {} - template + template KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, const XViewType &X, const YViewType &Y) const { @@ -76,6 +76,14 @@ class Identity { } } } + template + KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, + const YViewType &Y) const { + if (sameXY == 0) { + SerialCopy::invoke(X, Y); + } + } }; } // namespace KokkosBatched diff --git a/src/batched/sparse/KokkosBatched_JacobiPrec.hpp b/src/batched/sparse/KokkosBatched_JacobiPrec.hpp index 129378ed43..e4bfbefd0f 100644 --- a/src/batched/sparse/KokkosBatched_JacobiPrec.hpp +++ b/src/batched/sparse/KokkosBatched_JacobiPrec.hpp @@ -77,6 +77,8 @@ class JacobiPrec { KOKKOS_INLINE_FUNCTION ~JacobiPrec() {} + KOKKOS_INLINE_FUNCTION void 
setComputedInverse() { computed_inverse = true; } + template KOKKOS_INLINE_FUNCTION void computeInverse(const MemberType &member) const { auto one = Kokkos::Details::ArithTraits::one(); @@ -141,8 +143,30 @@ class JacobiPrec { computed_inverse = true; } - template + KOKKOS_INLINE_FUNCTION void computeInverse() const { + auto one = Kokkos::Details::ArithTraits::one(); + auto epsilon = Kokkos::Details::ArithTraits::epsilon(); + int tooSmall = 0; + + for (int i = 0; i < n_operators; ++i) + for (int j = 0; j < n_colums; ++j) { + if (Kokkos::abs(diag_values(i, j)) <= epsilon) { + ++tooSmall; + diag_values(i, j) = one; + } else + diag_values(i, j) = one / diag_values(i, j); + } + + if (tooSmall > 0) + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBatched::JacobiPrec: %d entrie(s) has/have a too small " + "magnitude and have been replaced by one, \n", + (int)tooSmall); + computed_inverse = true; + } + + template KOKKOS_INLINE_FUNCTION void apply(const MemberType &member, const XViewType &X, const YViewType &Y) const { @@ -154,6 +178,19 @@ class JacobiPrec { KokkosBatched::HadamardProduct::template invoke< ValuesViewType, XViewType, YViewType>(member, diag_values, X, Y); } + + template + KOKKOS_INLINE_FUNCTION void apply(const XViewType &X, + const YViewType &Y) const { + if (!computed_inverse) { + this->computeInverse(); + } + + KokkosBatched::SerialHadamardProduct::template invoke( + diag_values, X, Y); + } }; } // namespace KokkosBatched diff --git a/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp b/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp index f14eac7065..3467a6f910 100644 --- a/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp +++ b/src/batched/sparse/KokkosBatched_Krylov_Handle.hpp @@ -42,35 +42,194 @@ //@HEADER */ -#include -#include -#include - #ifndef __KOKKOSBATCHED_KRYLOV_HANDLE_HPP__ #define __KOKKOSBATCHED_KRYLOV_HANDLE_HPP__ -//#define VERBOSE + +#include +#include namespace KokkosBatched { /// \brief KrylovHandle /// -/// \tparam scalar_type: Scalar 
type of the linear solver +/// The handle is used to pass information between the Krylov solver and the +/// calling code. +/// +/// The handle has some views as data member, their required size can be +/// different depending on the used Krylov solver. +/// +/// In the case of the Batched GMRES, the size should be as follows: +/// - Arnoldi_view a batched_size x max_iteration x (n_rows + max_iteration + +/// 3); +/// - tmp_view is NOT used for the team/teamvector GMRES; +/// it is used for the serial GMRES and the size is batched_size x (n_rows + +/// max_iteration + 3); +/// - residual_norms is an optional batched_size x (max_iteration + 2) used to +/// store the convergence history; +/// - iteration_numbers is a 1D view of length batched_size; +/// - first_index and last_index are 1D of length n_teams. +/// +/// \tparam NormViewType: type of the view used to store the convergence history +/// \tparam IntViewType: type of the view used to store the number of iteration +/// per system \tparam ViewType3D: type of the 3D temporary views -template +template class KrylovHandle { public: - using norm_type = - typename Kokkos::Details::ArithTraits::mag_type; + using norm_type = typename NormViewType::non_const_value_type; + + typedef ViewType3D ArnoldiViewType; + typedef Kokkos::View + TemporaryViewType; + + public: + NormViewType residual_norms; + IntViewType iteration_numbers; + typename NormViewType::HostMirror residual_norms_host; + typename IntViewType::HostMirror iteration_numbers_host; + IntViewType first_index; + IntViewType last_index; + ArnoldiViewType Arnoldi_view; + TemporaryViewType tmp_view; private: norm_type tolerance; + norm_type max_tolerance; int max_iteration; + int batched_size; + const int N_team; + int n_teams; + int ortho_strategy; + int scratch_pad_level; + bool compute_last_residual; + bool monitor_residual; + bool host_synchronised; public: - KOKKOS_INLINE_FUNCTION - KrylovHandle() { + KrylovHandle(int _batched_size, int _N_team, int 
_max_iteration = 200, + bool _monitor_residual = false) + : max_iteration(_max_iteration), + batched_size(_batched_size), + N_team(_N_team), + monitor_residual(_monitor_residual) { tolerance = Kokkos::Details::ArithTraits::epsilon(); - max_iteration = 200; + max_tolerance = 1e-30; + if (std::is_same::value) max_tolerance = 1e-50; + if (monitor_residual) { + residual_norms = NormViewType("", batched_size, max_iteration + 2); + } + iteration_numbers = IntViewType("", batched_size); + Kokkos::deep_copy(iteration_numbers, -1); + + n_teams = ceil(1. * batched_size / N_team); + first_index = IntViewType("", n_teams); + last_index = IntViewType("", n_teams); + + auto first_index_host = Kokkos::create_mirror_view(first_index); + auto last_index_host = Kokkos::create_mirror_view(last_index); + + first_index_host(0) = 0; + last_index_host(0) = N_team; + for (int i = 1; i < n_teams; ++i) { + first_index_host(i) = last_index_host(i - 1); + last_index_host(i) = first_index_host(i) + N_team; + } + last_index_host(n_teams - 1) = batched_size; + + Kokkos::deep_copy(first_index, first_index_host); + Kokkos::deep_copy(last_index, last_index_host); + + // Default: modified GS (ortho_strategy = 1, see set_ortho_strategy) + ortho_strategy = 1; + scratch_pad_level = 0; + compute_last_residual = true; + host_synchronised = false; + } + + /// \brief get_number_of_systems_per_team + int get_number_of_systems_per_team() { return N_team; } + + /// \brief get_number_of_teams + int get_number_of_teams() { return n_teams; } + + /// \brief reset + /// Reset the iteration numbers to the default value of -1 + /// and the residual norms if monitored. + /// (Useful when multiple consecutive solvers use the same handle) + /// + + void reset() { + Kokkos::deep_copy(iteration_numbers, -1); + if (monitor_residual) { + Kokkos::deep_copy(residual_norms, 0.); + } + host_synchronised = false; + } + + /// \brief synchronise_host + /// Synchronise host and device. 
+ /// + + void synchronise_host() { + iteration_numbers_host = Kokkos::create_mirror_view(iteration_numbers); + Kokkos::deep_copy(iteration_numbers_host, iteration_numbers); + if (monitor_residual) { + residual_norms_host = Kokkos::create_mirror_view(residual_norms); + Kokkos::deep_copy(residual_norms_host, residual_norms); + } + host_synchronised = true; + } + + /// \brief is_converged + /// Test if all the systems have converged. + /// + + KOKKOS_INLINE_FUNCTION + bool is_converged() const { + bool all_converged = true; + for (size_t i = 0; i < batched_size; ++i) + if (iteration_numbers(i) == -1) { + all_converged = false; + break; + } + return all_converged; + } + + /// \brief is_converged_host + /// Test if all the systems have converged (host). + /// + + bool is_converged_host() { + if (!host_synchronised) this->synchronise_host(); + bool all_converged = true; + for (int i = 0; i < batched_size; ++i) + if (iteration_numbers_host(i) == -1) { + all_converged = false; + break; + } + return all_converged; + } + + /// \brief is_converged + /// Test if one particular system has converged. + /// + /// \param batched_id [in]: Global batched ID + + KOKKOS_INLINE_FUNCTION + bool is_converged(int batched_id) const { + return (iteration_numbers(batched_id) != -1); + } + + /// \brief is_converged + /// Test if one particular system has converged (host). 
+ /// + /// \param batched_id [in]: Global batched ID + + bool is_converged_host(int batched_id) { + if (!host_synchronised) this->synchronise_host(); + return (iteration_numbers_host(batched_id) != -1); } /// \brief set_tolerance @@ -87,21 +246,259 @@ class KrylovHandle { KOKKOS_INLINE_FUNCTION norm_type get_tolerance() const { return tolerance; } + /// \brief set_max_tolerance + /// Set the maximal tolerance of the batched Krylov solver + /// + /// \param _max_tolerance [in]: New tolerance + + KOKKOS_INLINE_FUNCTION + void set_max_tolerance(norm_type _max_tolerance) { + max_tolerance = _max_tolerance; + } + + /// \brief get_max_tolerance + /// Get the maximal tolerance of the batched Krylov solver + + KOKKOS_INLINE_FUNCTION + norm_type get_max_tolerance() const { return max_tolerance; } + /// \brief set_max_iteration /// Set the maximum number of iterations of the batched Krylov solver /// /// \param _max_iteration [in]: New maximum number of iterations KOKKOS_INLINE_FUNCTION - void set_max_iteration(norm_type _max_iteration) { - max_iteration = _max_iteration; - } + void set_max_iteration(int _max_iteration) { max_iteration = _max_iteration; } /// \brief get_max_iteration /// Get the maximum number of iterations of the batched Krylov solver KOKKOS_INLINE_FUNCTION int get_max_iteration() const { return max_iteration; } + + /// \brief get_norm + /// Get the norm of one system at a given iteration + /// + /// \param batched_id [in]: Global batched ID + /// \param iteration_id [in]: Iteration ID + + KOKKOS_INLINE_FUNCTION + norm_type get_norm(int batched_id, int iteration_id) const { + if (monitor_residual) { + return residual_norms(batched_id, iteration_id); + } else + return 0; + } + + /// \brief get_norm_host + /// Get the norm of one system at a given iteration (host) + /// + /// \param batched_id [in]: Global batched ID + /// \param iteration_id [in]: Iteration ID + + norm_type get_norm_host(int batched_id, int iteration_id) { + if (monitor_residual) { + if 
(!host_synchronised) this->synchronise_host(); + return residual_norms_host(batched_id, iteration_id); + } else + return 0; + } + + /// \brief get_last_norm + /// Get the last norm of one system + /// + /// \param batched_id [in]: Global batched ID + + KOKKOS_INLINE_FUNCTION + norm_type get_last_norm(int batched_id) const { + if (monitor_residual && compute_last_residual) { + return residual_norms(batched_id, max_iteration + 1); + } else + return 0; + } + + /// \brief get_last_norm_host + /// Get the last norm of one system (host) + /// + /// \param batched_id [in]: Global batched ID + + norm_type get_last_norm_host(int batched_id) { + if (monitor_residual && compute_last_residual) { + if (!host_synchronised) this->synchronise_host(); + return residual_norms_host(batched_id, max_iteration + 1); + } else + return 0; + } + + /// \brief get_iteration + /// Get the number of iteration after convergence for one system + /// + /// \param batched_id [in]: Global batched ID + + KOKKOS_INLINE_FUNCTION + int get_iteration(int batched_id) const { + return iteration_numbers(batched_id); + } + + /// \brief get_iteration_host + /// Get the number of iteration after convergence for one system (host) + /// + /// \param batched_id [in]: Global batched ID + + int get_iteration_host(int batched_id) { + if (!host_synchronised) this->synchronise_host(); + return iteration_numbers_host(batched_id); + } + + /// \brief set_ortho_strategy + /// Set the used orthogonalization strategy. + /// Either classical GS (_ortho_strategy=0) or modified GS + /// (_ortho_strategy=1) + /// + /// \param _ortho_strategy [in]: used orthogonalization strategy + + KOKKOS_INLINE_FUNCTION + void set_ortho_strategy(int _ortho_strategy) { + ortho_strategy = _ortho_strategy; + } + + /// \brief get_ortho_strategy + /// Get the used orthogonalization strategy. 
+ /// Either classical GS (_ortho_strategy=0) or modified GS + /// (_ortho_strategy=1) + + KOKKOS_INLINE_FUNCTION + int get_ortho_strategy() const { return ortho_strategy; } + + /// \brief set_scratch_pad_level + /// Set the scratch pad level used to store temporary variables. + /// + /// \param _scratch_pad_level [in]: used level + + KOKKOS_INLINE_FUNCTION + void set_scratch_pad_level(int _scratch_pad_level) { + scratch_pad_level = _scratch_pad_level; + } + + /// \brief get_scratch_pad_level + /// Get the scratch pad level used to store temporary variables. + + KOKKOS_INLINE_FUNCTION + int get_scratch_pad_level() const { return scratch_pad_level; } + + /// \brief set_compute_last_residual + /// Select if the last residual is explicitly computed. + /// + /// \param _compute_last_residual [in]: boolean that specifies if we compute + /// the last residual explicitly + + KOKKOS_INLINE_FUNCTION + void set_compute_last_residual(bool _compute_last_residual) { + if (monitor_residual) + compute_last_residual = _compute_last_residual; + else + compute_last_residual = false; + } + + /// \brief get_compute_last_residual + /// Specify if the last residual has to be computed explicitly. 
+ + KOKKOS_INLINE_FUNCTION + bool get_compute_last_residual() const { + if (monitor_residual) + return compute_last_residual; + else + return false; + } + + private: + /// \brief set_norm + /// Store the norm of one of the systems at one of the iterations + /// + /// \param batched_id [in]: Global batched ID + /// \param iteration_id [in]: Iteration ID + /// \param norm_i [in]: Norm to store + + KOKKOS_INLINE_FUNCTION + void set_norm(int batched_id, int iteration_id, norm_type norm_i) const { + if (monitor_residual) residual_norms(batched_id, iteration_id) = norm_i; + } + + /// \brief set_norm + /// Store the norm of one of the systems at one of the iterations + /// + /// \param team_id [in]: Team ID + /// \param batched_id [in]: Local batched ID (local ID within the team) + /// \param iteration_id [in]: Iteration ID + /// \param norm_i [in]: Norm to store + + KOKKOS_INLINE_FUNCTION + void set_norm(int team_id, int batched_id, int iteration_id, + norm_type norm_i) const { + if (monitor_residual) + residual_norms(team_id * N_team + batched_id, iteration_id) = norm_i; + } + + /// \brief set_last_norm + /// Store the last norm of one system + /// + /// \param batched_id [in]: Global batched ID + /// \param norm_i [in]: Norm to store + + KOKKOS_INLINE_FUNCTION + void set_last_norm(int batched_id, norm_type norm_i) const { + if (monitor_residual) + residual_norms(batched_id, max_iteration + 1) = norm_i; + } + + /// \brief set_last_norm + /// Store the last norm of one system + /// + /// \param team_id [in]: Team ID + /// \param batched_id [in]: Local batched ID (local ID within the team) + /// (stored at global batched ID team_id * N_team + batched_id) + /// \param norm_i [in]: Norm to store + + KOKKOS_INLINE_FUNCTION + void set_last_norm(int team_id, int batched_id, norm_type norm_i) const { + if (monitor_residual) + residual_norms(team_id * N_team + batched_id, max_iteration + 1) = norm_i; + } + + /// \brief set_iteration + /// Store the number of iterations after convergence 
for one system + /// + /// \param batched_id [in]: Global batched ID + /// \param iteration_id [in]: Iteration ID + + KOKKOS_INLINE_FUNCTION + void set_iteration(int batched_id, int iteration_id) const { + iteration_numbers(batched_id) = iteration_id; + } + + /// \brief set_iteration + /// Store the number of iterations after convergence for one system + /// + /// \param team_id [in]: Team ID + /// \param batched_id [in]: Local batched ID (local ID within the team) + /// \param iteration_id [in]: Iteration ID + + KOKKOS_INLINE_FUNCTION + void set_iteration(int team_id, int batched_id, int iteration_id) const { + iteration_numbers(team_id * N_team + batched_id) = iteration_id; + } + + public: + friend struct SerialGMRES; + template + friend struct TeamGMRES; + template + friend struct TeamVectorGMRES; + + template + friend struct TeamCG; + template + friend struct TeamVectorCG; }; } // namespace KokkosBatched diff --git a/src/batched/sparse/KokkosBatched_Krylov_Solvers.hpp b/src/batched/sparse/KokkosBatched_Krylov_Solvers.hpp new file mode 100644 index 0000000000..413c72678f --- /dev/null +++ b/src/batched/sparse/KokkosBatched_Krylov_Solvers.hpp @@ -0,0 +1,129 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef __KOKKOSBATCHED_KRYLOV_SOLVERS_HPP__ +#define __KOKKOSBATCHED_KRYLOV_SOLVERS_HPP__ + +namespace KokkosBatched { + +struct SerialGMRES { + template + KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle, + const int GMRES_id); + template + KOKKOS_INLINE_FUNCTION static int invoke(const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle); +}; + +template +struct TeamGMRES { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle); +}; + +template +struct TeamVectorGMRES { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle); + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle); +}; + +template +struct TeamCG { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle); +}; + +template +struct TeamVectorCG { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const OperatorType& A, + const VectorViewType& _B, + 
const VectorViewType& _X, + const KrylovHandleType& handle); +}; + +} // namespace KokkosBatched + +#endif diff --git a/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp index 83e8fb90ed..a106d0ae8f 100644 --- a/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp @@ -61,139 +61,145 @@ namespace KokkosBatched { /// template -struct TeamVectorCG { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandle& - handle) { - typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - - const size_t maximum_iteration = handle.get_max_iteration(); - const MagnitudeType tolerance = handle.get_tolerance(); - - using ScratchPadNormViewType = Kokkos::View< - MagnitudeType*, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using TeamVectorCopy1D = TeamVectorCopy; - - const OrdinalType numMatrices = _X.extent(0); - const OrdinalType numRows = _X.extent(1); - - ScratchPadVectorViewType P(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType Q(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType R(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType X(member.team_scratch(0), numMatrices, numRows); - - ScratchPadNormViewType sqr_norm_0(member.team_scratch(0), numMatrices); - ScratchPadNormViewType sqr_norm_j(member.team_scratch(0), numMatrices); - ScratchPadNormViewType alpha(member.team_scratch(0), numMatrices); - ScratchPadNormViewType 
mask(member.team_scratch(0), numMatrices); - ScratchPadNormViewType tmp(member.team_scratch(0), numMatrices); - - TeamVectorCopy::invoke(member, _X, X); - // Deep copy of b into r_0: - TeamVectorCopy::invoke(member, _B, R); - - // r_0 := b - A x_0 +template +KOKKOS_INLINE_FUNCTION int TeamVectorCG::invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle) { + typedef int OrdinalType; + typedef typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + + const size_t maximum_iteration = handle.get_max_iteration(); + const MagnitudeType tolerance = handle.get_tolerance(); + + using ScratchPadNormViewType = Kokkos::View< + MagnitudeType*, + typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + using TeamVectorCopy1D = TeamVectorCopy; + + const OrdinalType numMatrices = _X.extent(0); + const OrdinalType numRows = _X.extent(1); + + ScratchPadVectorViewType P( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType Q( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType R( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType X( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + + ScratchPadNormViewType sqr_norm_0( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType sqr_norm_j( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType alpha( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType mask( + 
member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType tmp( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + + TeamVectorCopy::invoke(member, _X, X); + // Deep copy of b into r_0: + TeamVectorCopy::invoke(member, _B, R); + + // r_0 := b - A x_0 + member.team_barrier(); + A.template apply(member, X, R, -1, 1); + member.team_barrier(); + + // Deep copy of r_0 into p_0: + TeamVectorCopy::invoke(member, R, P); + + TeamVectorDot::invoke(member, R, R, sqr_norm_0); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + mask(i) = + sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; + }); + + TeamVectorCopy1D::invoke(member, sqr_norm_0, sqr_norm_j); + + int status = 1; + int number_not_converged = 0; + + for (size_t j = 0; j < maximum_iteration; ++j) { + // q := A p_j + A.template apply(member, P, Q); member.team_barrier(); - A.template apply(member, X, R, -1, 1); + + TeamVectorDot::invoke(member, P, Q, tmp); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + alpha(i) = + mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; + }); + member.team_barrier(); + + // x_{j+1} := alpha p_j + x_j + TeamVectorAxpy::invoke(member, alpha, P, X); + member.team_barrier(); + + // r_{j+1} := - alpha q + r_j + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { alpha(i) = -alpha(i); }); member.team_barrier(); - // Deep copy of r_0 into p_0: - TeamVectorCopy::invoke(member, R, P); + TeamVectorAxpy::invoke(member, alpha, Q, R); + member.team_barrier(); - TeamVectorDot::invoke(member, R, R, sqr_norm_0); + TeamVectorDot::invoke(member, R, R, tmp); member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& i) { - mask(i) = - sqr_norm_0(i) > tolerance * tolerance ? 1. 
: 0; + alpha(i) = + mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; }); - TeamVectorCopy1D::invoke(member, sqr_norm_0, sqr_norm_j); - - int status = 1; - int number_not_converged = 0; - - for (size_t j = 0; j < maximum_iteration; ++j) { - // q := A p_j - A.template apply(member, P, Q); - member.team_barrier(); - - TeamVectorDot::invoke(member, P, Q, tmp); - member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; - }); - member.team_barrier(); - - // x_{j+1} := alpha p_j + x_j - TeamVectorAxpy::invoke(member, alpha, P, X); - member.team_barrier(); - - // r_{j+1} := - alpha q + r_j - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { alpha(i) = -alpha(i); }); - member.team_barrier(); - - TeamVectorAxpy::invoke(member, alpha, Q, R); - member.team_barrier(); - - TeamVectorDot::invoke(member, R, R, tmp); - member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? 
tmp(i) / sqr_norm_j(i) : 0.; - }); - - TeamVectorCopy1D::invoke(member, tmp, sqr_norm_j); - - // Relative convergence check: - number_not_converged = 0; - Kokkos::parallel_reduce( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i, int& lnumber_not_converged) { - if (sqr_norm_j(i) / sqr_norm_0(i) > tolerance * tolerance) - ++lnumber_not_converged; - else - mask(i) = 0.; - }, - number_not_converged); - - member.team_barrier(); - - if (number_not_converged == 0) { - status = 0; - break; - } - - // p_{j+1} := alpha p_j + r_{j+1} - TeamVectorXpay::invoke(member, alpha, R, P); - member.team_barrier(); + TeamVectorCopy1D::invoke(member, tmp, sqr_norm_j); + + // Relative convergence check: + number_not_converged = 0; + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i, int& lnumber_not_converged) { + if (sqr_norm_j(i) / sqr_norm_0(i) > tolerance * tolerance) + ++lnumber_not_converged; + else + mask(i) = 0.; + }, + number_not_converged); + + member.team_barrier(); + + if (number_not_converged == 0) { + status = 0; + break; } - TeamVectorCopy::invoke(member, X, _X); - return status; + // p_{j+1} := alpha p_j + r_{j+1} + TeamVectorXpay::invoke(member, alpha, R, P); + member.team_barrier(); } -}; + + TeamVectorCopy::invoke(member, X, _X); + return status; +} } // namespace KokkosBatched #endif diff --git a/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp index 2bc611aa32..cd7a478548 100644 --- a/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp @@ -60,139 +60,145 @@ namespace KokkosBatched { /// template -struct TeamCG { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandle& - handle) { - typedef int OrdinalType; - typedef typename 
Kokkos::Details::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - - size_t maximum_iteration = handle.get_max_iteration(); - const MagnitudeType tolerance = handle.get_tolerance(); - - using ScratchPadNormViewType = Kokkos::View< - MagnitudeType*, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using TeamCopy1D = TeamCopy; - - const OrdinalType numMatrices = _X.extent(0); - const OrdinalType numRows = _X.extent(1); - - ScratchPadVectorViewType P(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType Q(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType R(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType X(member.team_scratch(0), numMatrices, numRows); - - ScratchPadNormViewType sqr_norm_0(member.team_scratch(0), numMatrices); - ScratchPadNormViewType sqr_norm_j(member.team_scratch(0), numMatrices); - ScratchPadNormViewType alpha(member.team_scratch(0), numMatrices); - ScratchPadNormViewType mask(member.team_scratch(0), numMatrices); - ScratchPadNormViewType tmp(member.team_scratch(0), numMatrices); - - TeamCopy::invoke(member, _X, X); - // Deep copy of b into r_0: - TeamCopy::invoke(member, _B, R); - - // r_0 := b - A x_0 +template +KOKKOS_INLINE_FUNCTION int TeamCG::invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandle& handle) { + typedef int OrdinalType; + typedef typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + + size_t maximum_iteration = handle.get_max_iteration(); + const MagnitudeType tolerance = handle.get_tolerance(); + + using ScratchPadNormViewType = Kokkos::View< + MagnitudeType*, + 
typename VectorViewType::execution_space::scratch_memory_space>; + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename VectorViewType::execution_space::scratch_memory_space>; + using TeamCopy1D = TeamCopy; + + const OrdinalType numMatrices = _X.extent(0); + const OrdinalType numRows = _X.extent(1); + + ScratchPadVectorViewType P( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType Q( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType R( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + ScratchPadVectorViewType X( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + numRows); + + ScratchPadNormViewType sqr_norm_0( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType sqr_norm_j( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType alpha( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType mask( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + ScratchPadNormViewType tmp( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices); + + TeamCopy::invoke(member, _X, X); + // Deep copy of b into r_0: + TeamCopy::invoke(member, _B, R); + + // r_0 := b - A x_0 + member.team_barrier(); + A.template apply(member, X, R, -1, 1); + member.team_barrier(); + + // Deep copy of r_0 into p_0: + TeamCopy::invoke(member, R, P); + + TeamDot::invoke(member, R, R, sqr_norm_0); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + mask(i) = + sqr_norm_0(i) > tolerance * tolerance ? 1. 
: 0; + }); + + TeamCopy1D::invoke(member, sqr_norm_0, sqr_norm_j); + + int status = 1; + int number_not_converged = 0; + + for (size_t j = 0; j < maximum_iteration; ++j) { + // q := A p_j + A.template apply(member, P, Q); member.team_barrier(); - A.template apply( - member, X, R, -1, 1); + + TeamDot::invoke(member, P, Q, tmp); member.team_barrier(); - // Deep copy of r_0 into p_0: - TeamCopy::invoke(member, R, P); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + alpha(i) = + mask(i) != 0. ? sqr_norm_j(i) / tmp(i) : 0.; + }); + member.team_barrier(); - TeamDot::invoke(member, R, R, sqr_norm_0); + // x_{j+1} := alpha p_j + x_j + TeamAxpy::invoke(member, alpha, P, X); + member.team_barrier(); + + // r_{j+1} := - alpha q + r_j + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { alpha(i) = -alpha(i); }); + member.team_barrier(); + + TeamAxpy::invoke(member, alpha, Q, R); + member.team_barrier(); + + TeamDot::invoke(member, R, R, tmp); member.team_barrier(); Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& i) { - mask(i) = - sqr_norm_0(i) > tolerance * tolerance ? 1. : 0; + alpha(i) = + mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; }); - TeamCopy1D::invoke(member, sqr_norm_0, sqr_norm_j); - - int status = 1; - int number_not_converged = 0; - - for (size_t j = 0; j < maximum_iteration; ++j) { - // q := A p_j - A.template apply(member, P, Q); - member.team_barrier(); - - TeamDot::invoke(member, P, Q, tmp); - member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? 
sqr_norm_j(i) / tmp(i) : 0.; - }); - member.team_barrier(); - - // x_{j+1} := alpha p_j + x_j - TeamAxpy::invoke(member, alpha, P, X); - member.team_barrier(); - - // r_{j+1} := - alpha q + r_j - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { alpha(i) = -alpha(i); }); - member.team_barrier(); - - TeamAxpy::invoke(member, alpha, Q, R); - member.team_barrier(); - - TeamDot::invoke(member, R, R, tmp); - member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - alpha(i) = - mask(i) != 0. ? tmp(i) / sqr_norm_j(i) : 0.; - }); - - TeamCopy1D::invoke(member, tmp, sqr_norm_j); - - // Relative convergence check: - number_not_converged = 0; - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i, int& lnumber_not_converged) { - if (sqr_norm_j(i) / sqr_norm_0(i) > tolerance * tolerance) - ++lnumber_not_converged; - else - mask(i) = 0.; - }, - number_not_converged); - - member.team_barrier(); - - if (number_not_converged == 0) { - status = 0; - break; - } - - // p_{j+1} := alpha p_j + r_{j+1} - TeamXpay::invoke(member, alpha, R, P); - member.team_barrier(); + TeamCopy1D::invoke(member, tmp, sqr_norm_j); + + // Relative convergence check: + number_not_converged = 0; + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i, int& lnumber_not_converged) { + if (sqr_norm_j(i) / sqr_norm_0(i) > tolerance * tolerance) + ++lnumber_not_converged; + else + mask(i) = 0.; + }, + number_not_converged); + + member.team_barrier(); + + if (number_not_converged == 0) { + status = 0; + break; } - TeamCopy::invoke(member, X, _X); - return status; + // p_{j+1} := alpha p_j + r_{j+1} + TeamXpay::invoke(member, alpha, R, P); + member.team_barrier(); } -}; + + TeamCopy::invoke(member, X, _X); + return status; +} + } // namespace KokkosBatched #endif diff --git 
a/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp new file mode 100644 index 0000000000..5e4d0aba9b --- /dev/null +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_Serial_Impl.hpp @@ -0,0 +1,328 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.4 +// Copyright (2021) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +#ifndef __KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP__ +#define __KOKKOSBATCHED_GMRES_SERIAL_IMPL_HPP__ + +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "KokkosBatched_Util.hpp" + +#include "KokkosBatched_Axpy.hpp" +#include "KokkosBatched_Copy_Decl.hpp" +#include "KokkosBatched_Dot.hpp" +#include "KokkosBatched_Spmv.hpp" +#include "KokkosBatched_Xpay.hpp" +#include "KokkosBatched_Givens_Serial_Internal.hpp" +#include "KokkosBatched_Trsm_Decl.hpp" +#include "KokkosBatched_Identity.hpp" +#include "KokkosBatched_Gemv_Decl.hpp" + +namespace KokkosBatched { + +/// +/// Serial GMRES +/// + +template +KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const PrecOperatorType& P, + const KrylovHandleType& handle, + const int GMRES_id) { + typedef int OrdinalType; + typedef typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef Kokkos::Details::ArithTraits ATM; + + using SerialCopy1D = SerialCopy; + using SerialCopy2D = SerialCopy; + + const OrdinalType numMatrices = _X.extent(0); + const OrdinalType numRows = _X.extent(1); + + size_t maximum_iteration = handle.get_max_iteration() < numRows + ? 
handle.get_max_iteration() + : numRows; + const MagnitudeType tolerance = handle.get_tolerance(); + const MagnitudeType max_tolerance = handle.get_max_tolerance(); + + int n_V = numRows; + int n_H = maximum_iteration + 1; + int n_Givens = 2; + + int offset_V = 0; + int offset_H = offset_V + n_V; + int offset_Givens = offset_H + n_H; + + const int first_matrix = handle.first_index(GMRES_id); + const int last_matrix = handle.last_index(GMRES_id); + + auto V_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + + int n_G = maximum_iteration + 1; + int n_W = numRows; + int n_mask = 1; + + int offset_G = 0; + int offset_W = offset_G + n_G; + int offset_mask = offset_W + n_W; + int offset_tmp = offset_mask + n_mask; + + auto G = Kokkos::subview(handle.tmp_view, + Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(handle.tmp_view, + Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::make_pair(offset_W, offset_W + n_W)); + auto mask = Kokkos::subview(handle.tmp_view, + Kokkos::make_pair(first_matrix, last_matrix), + offset_mask); + auto tmp = + Kokkos::subview(handle.tmp_view, + Kokkos::make_pair(first_matrix, last_matrix), offset_tmp); + + // Deep copy of b into r_0: + SerialCopy2D::invoke(_B, W); + + // r_0 := b - A x_0 + A.template apply(_X, W, -1, 1); + + P.template apply(W, W); + + SerialDot::invoke(W, W, tmp); + + for (OrdinalType i = 0; i < numMatrices; ++i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_norm(GMRES_id, i, 0, tmp(i)); + if (tmp(i) > 
max_tolerance) { + mask(i) = 1; + G(i, 0) = tmp(i); + tmp(i) = 1. / tmp(i); + } else { + handle.set_iteration(GMRES_id, i, 0); + mask(i) = 0; + G(i, 0) = 0.; + tmp(i) = 0.; + } + } + + auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); + for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { + for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) { + V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + } + } + int status = 1; + // int number_not_converged = 0; + + for (size_t j = 0; j < maximum_iteration; ++j) { + // q := A p_j + auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL); + + A.template apply(V_j, W); + + P.template apply(W, W); + + if (handle.get_ortho_strategy() == 0) { + for (OrdinalType l = 0; l < numMatrices; ++l) { + auto W_l = Kokkos::subview(W, l, Kokkos::ALL); + auto V_old = Kokkos::subview( + V_view, l, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = + Kokkos::subview(H_view, l, j, Kokkos::make_pair(0, (int)j + 1)); + + // Inner products + SerialGemv::invoke( + 1, V_old, W_l, 0, H_old); + + // Update + SerialGemv::invoke( + -1, V_old, H_old, 1, W_l); + } + } + if (handle.get_ortho_strategy() == 1) { + for (size_t i = 0; i < j + 1; ++i) { + auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); + SerialDot::invoke(W, V_i, tmp); + SerialCopy1D::invoke(tmp, Kokkos::subview(H_view, Kokkos::ALL, j, i)); + for (OrdinalType ii = 0; ii < numMatrices; ++ii) tmp(ii) = -tmp(ii); + + SerialAxpy::invoke(tmp, V_i, W); + } + } + + SerialDot::invoke(W, W, tmp); + + for (OrdinalType i = 0; i < numMatrices; ++i) { + H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); + tmp(i) = + H_view(i, j, j + 1) > max_tolerance ? 1. 
/ H_view(i, j, j + 1) : 0.; + } + + if (j + 1 < maximum_iteration) { + auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); + for (OrdinalType iRow = 0; iRow < numRows; ++iRow) { + for (OrdinalType iMatrix = 0; iMatrix < numMatrices; ++iMatrix) { + V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + } + } + } + + for (OrdinalType l = 0; l < numMatrices; ++l) { + // Apply the previous Givens rotations: + auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); + auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); + auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); + + if (mask(l) == 1.) { + for (size_t i = 0; i < j; ++i) { + auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); + auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); + H_j(i) = tmp1; + H_j(i + 1) = tmp2; + } + + // Compute the new Givens rotation: + Kokkos::pair + G_new(1, 0); + typename VectorViewType::non_const_value_type alpha = 0; + SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); + + Givens_0_l(j) = G_new.first; + Givens_1_l(j) = G_new.second; + + // Apply the new Givens rotation: + auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); + auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); + H_j(j) = tmp1; + H_j(j + 1) = tmp2; + + G(l, j + 1) = -Givens_1_l(j) * G(l, j); + G(l, j) *= Givens_0_l(j); + } else { + H_j(j) = 1.; + G(l, j + 1) = 0.; + } + + auto res_norm = Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + + handle.set_norm(GMRES_id, l, j + 1, res_norm); + + if (mask(l) == 1. 
&& res_norm < tolerance) { + mask(l) = 0.; + G(l, j + 1) = 0.; + handle.set_iteration(GMRES_id, l, j + 1); + } + } + + bool all_converged = true; + for (OrdinalType l = 0; l < numMatrices; ++l) + all_converged = (all_converged && mask(l) == 0.); + if (all_converged) { + maximum_iteration = j + 1; + break; + } + } + + auto first_indices = Kokkos::make_pair(0, (int)maximum_iteration); + + for (OrdinalType l = 0; l < numMatrices; ++l) { + auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); + auto B_l = Kokkos::subview(G, l, first_indices); + + SerialTrsm::invoke(1, A_l, B_l); + } + + if (handle.get_ortho_strategy() == 0) { + for (OrdinalType l = 0; l < numMatrices; ++l) { + SerialGemv::invoke( + 1, Kokkos::subview(V_view, l, first_indices, Kokkos::ALL), + Kokkos::subview(G, l, first_indices), 1, + Kokkos::subview(_X, l, Kokkos::ALL)); + } + } + if (handle.get_ortho_strategy() == 1) { + for (size_t j = 0; j < maximum_iteration; ++j) { + SerialAxpy::invoke(Kokkos::subview(G, Kokkos::ALL, j), + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), + _X); + } + } + + if (handle.get_compute_last_residual()) { + SerialCopy2D::invoke(_B, W); + A.template apply(_X, W, -1, 1); + P.template apply(W, W); + SerialDot::invoke(W, W, tmp); + + for (OrdinalType i = 0; i < numMatrices; ++i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_last_norm(GMRES_id, i, tmp(i)); + } + } + return status; +} + +template +KOKKOS_INLINE_FUNCTION int SerialGMRES::invoke(const OperatorType& A, + const VectorViewType& _B, + const VectorViewType& _X, + const KrylovHandleType& handle) { + Identity P; + return invoke(A, _B, _X, P, handle); +} +} // namespace KokkosBatched + +#endif diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp index 8e45b97556..7fdf244fa7 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp +++ 
b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp @@ -54,6 +54,7 @@ #include "KokkosBatched_Givens_Serial_Internal.hpp" #include "KokkosBatched_Trsm_Decl.hpp" #include "KokkosBatched_Identity.hpp" +#include "KokkosBatched_Gemv_Decl.hpp" namespace KokkosBatched { @@ -64,125 +65,159 @@ namespace KokkosBatched { /// template -struct TeamVectorGMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandle& - handle) { - typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - typedef Kokkos::Details::ArithTraits ATM; - - using ScratchPadNormViewType = Kokkos::View< - MagnitudeType*, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadMultiVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type***, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using TeamVectorCopy1D = TeamVectorCopy; - - const OrdinalType numMatrices = _X.extent(0); - const OrdinalType numRows = _X.extent(1); - - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? 
handle.get_max_iteration() - : numRows; - const MagnitudeType tolerance = handle.get_tolerance(); - const MagnitudeType max_tolerance = 0.; - - ScratchPadMultiVectorViewType V(member.team_scratch(1), numMatrices, - maximum_iteration + 1, numRows); - ScratchPadMultiVectorViewType H(member.team_scratch(1), numMatrices, - maximum_iteration + 1, maximum_iteration); - ScratchPadMultiVectorViewType Givens(member.team_scratch(1), numMatrices, - maximum_iteration, 2); - ScratchPadVectorViewType G(member.team_scratch(1), numMatrices, - maximum_iteration + 1); - - ScratchPadVectorViewType W(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType Q(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType R(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType X(member.team_scratch(0), numMatrices, numRows); - - ScratchPadNormViewType beta(member.team_scratch(0), numMatrices); - ScratchPadNormViewType mask(member.team_scratch(0), numMatrices); - ScratchPadNormViewType tmp(member.team_scratch(0), numMatrices); - - TeamVectorCopy::invoke(member, _X, X); - // Deep copy of b into r_0: - TeamVectorCopy::invoke(member, _B, R); - - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { mask(i) = 1.; }); - - // r_0 := b - A x_0 - member.team_barrier(); - A.template apply(member, X, R, -1, 1); +template +KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle) { + typedef int OrdinalType; + typedef typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef Kokkos::Details::ArithTraits ATM; + + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename 
VectorViewType::execution_space::scratch_memory_space>; + using TeamVectorCopy1D = TeamVectorCopy; + + const OrdinalType numMatrices = _X.extent(0); + const OrdinalType numRows = _X.extent(1); + + size_t maximum_iteration = handle.get_max_iteration() < numRows + ? handle.get_max_iteration() + : numRows; + const MagnitudeType tolerance = handle.get_tolerance(); + const MagnitudeType max_tolerance = handle.get_max_tolerance(); + + int n_V = numRows; + int n_H = maximum_iteration + 1; + int n_Givens = 2; + + int offset_V = 0; + int offset_H = offset_V + n_V; + int offset_Givens = offset_H + n_H; + + const int first_matrix = handle.first_index(member.league_rank()); + const int last_matrix = handle.last_index(member.league_rank()); + + auto V_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + + int n_G = maximum_iteration + 1; + int n_W = numRows; + int n_X = numRows; + int n_mask = 1; + int n_tmp = 1; + + int offset_G = 0; + int offset_W = offset_G + n_G; + int offset_X = offset_W + n_W; + int offset_mask = offset_X + n_X; + int offset_tmp = offset_mask + n_mask; + + ScratchPadVectorViewType tmp_2D( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_X + n_mask + n_tmp); + + auto G = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_W, offset_W + n_W)); + auto X = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_X, offset_X + n_X)); + auto mask = Kokkos::subview(tmp_2D, 
Kokkos::ALL, offset_mask); + auto tmp = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_tmp); + + TeamVectorCopy::invoke(member, _X, X); + // Deep copy of b into r_0: + TeamVectorCopy::invoke(member, _B, W); + + // r_0 := b - A x_0 + member.team_barrier(); + A.template apply(member, X, W, -1, 1); + member.team_barrier(); + + P.template apply(member, W, W); + member.team_barrier(); + + TeamVectorDot::invoke(member, W, W, tmp); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_norm(member.league_rank(), i, 0, tmp(i)); + if (tmp(i) > max_tolerance) { + mask(i) = 1; + G(i, 0) = tmp(i); + tmp(i) = 1. / tmp(i); + } else { + handle.set_iteration(member.league_rank(), i, 0); + mask(i) = 0; + G(i, 0) = 0.; + tmp(i) = 0.; + } + }); + + member.team_barrier(); // Finish writing to tmp + + auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), + [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices( + iTemp, numRows, numMatrices, iRow, iMatrix); + V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); + int status = 1; + // int number_not_converged = 0; + + for (size_t j = 0; j < maximum_iteration; ++j) { + member.team_barrier(); // Finish writing to V + // q := A p_j + auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL); + + A.template apply(member, V_j, W); member.team_barrier(); - P.template apply(member, R, R); + P.template apply(member, W, W); member.team_barrier(); - TeamVectorDot::invoke(member, R, R, beta); - member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - beta(i) = ATM::sqrt(beta(i)); - G(i, 0) = beta(i) > max_tolerance ? beta(i) : 0.; - tmp(i) = beta(i) > max_tolerance ? 1. 
/ beta(i) : 0.; - }); - - member.team_barrier(); // Finish writing to tmp - - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V(iMatrix, 0, iRow) = R(iMatrix, iRow) * tmp(iMatrix); - }); - - int status = 1; - // int number_not_converged = 0; - - for (size_t j = 0; j < maximum_iteration; ++j) { - member.team_barrier(); // Finish writing to V - // q := A p_j - auto V_j = Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL); - - A.template apply(member, V_j, W); + if (handle.get_ortho_strategy() == 0) { + auto V_old = Kokkos::subview( + V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, + Kokkos::make_pair(0, (int)j + 1)); + // Inner products + TeamVectorGemv::invoke(member, 1, V_old, W, 0, + H_old); member.team_barrier(); - P.template apply(member, W, W); + // Update + TeamVectorGemv::invoke(member, -1, V_old, H_old, 1, + W); + member.team_barrier(); // Finish writing to W + } + if (handle.get_ortho_strategy() == 1) { for (size_t i = 0; i < j + 1; ++i) { - member.team_barrier(); // Finish writing to W - auto V_i = Kokkos::subview(V, Kokkos::ALL, i, Kokkos::ALL); + auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); TeamVectorDot::invoke(member, W, V_i, tmp); member.team_barrier(); TeamVectorCopy1D::invoke(member, tmp, - Kokkos::subview(H, Kokkos::ALL, i, j)); - - member.team_barrier(); // Don't start modifying tmp until copy above - // finishes + Kokkos::subview(H_view, Kokkos::ALL, j, i)); + member.team_barrier(); Kokkos::parallel_for( Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); @@ -190,117 +225,161 @@ struct TeamVectorGMRES { member.team_barrier(); // Finish writing to tmp TeamVectorAxpy::invoke(member, tmp, V_i, W); + member.team_barrier(); // Finish writing to W } + } 
- member.team_barrier(); // Finish writing to W - TeamVectorDot::invoke(member, W, W, tmp); - member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - H(i, j + 1, j) = ATM::sqrt(tmp(i)); - tmp(i) = H(i, j + 1, j) > max_tolerance ? 1. / H(i, j + 1, j) : 0.; - }); - member.team_barrier(); + TeamVectorDot::invoke(member, W, W, tmp); + member.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); + tmp(i) = H_view(i, j, j + 1) > max_tolerance + ? 1. / H_view(i, j, j + 1) + : 0.; + }); + member.team_barrier(); + if (j + 1 < maximum_iteration) { + auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); Kokkos::parallel_for( Kokkos::TeamVectorRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { OrdinalType iRow, iMatrix; getIndices( iTemp, numRows, numMatrices, iRow, iMatrix); - V(iMatrix, j + 1, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); - - Kokkos::parallel_for( - Kokkos::TeamVectorRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - // Apply the previous Givens rotations: - auto H_j = Kokkos::subview(H, l, Kokkos::ALL, j); - - if (mask(l) == 1.) 
{ - for (size_t i = 0; i < j; ++i) { - auto tmp1 = - Givens(l, i, 0) * H_j(i) + Givens(l, i, 1) * H_j(i + 1); - auto tmp2 = - -Givens(l, i, 1) * H_j(i) + Givens(l, i, 0) * H_j(i + 1); - H_j(i) = tmp1; - H_j(i + 1) = tmp2; - } - - // Compute the new Givens rotation: - Kokkos::pair - G_new(1, 0); - typename VectorViewType::non_const_value_type alpha = 0; - SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); - - Givens(l, j, 0) = G_new.first; - Givens(l, j, 1) = G_new.second; - - // Apply the new Givens rotation: - auto tmp1 = - Givens(l, j, 0) * H_j(j) + Givens(l, j, 1) * H_j(j + 1); - auto tmp2 = - -Givens(l, j, 1) * H_j(j) + Givens(l, j, 0) * H_j(j + 1); - H_j(j) = tmp1; - H_j(j + 1) = tmp2; - - G(l, j + 1) = -Givens(l, j, 1) * G(l, j); - G(l, j) *= Givens(l, j, 0); - } else { - H_j(j) = 1.; - G(l, j + 1) = 0.; - } - - if (mask(l) == 1. && - Kokkos::ArithTraits::abs(G(l, j + 1)) / beta(l) < - tolerance) { - mask(l) = 0.; - G(l, j + 1) = 0.; - } + V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); }); + member.team_barrier(); } - member.team_barrier(); // Finish writing to G - Kokkos::parallel_for( Kokkos::TeamVectorRange(member, 0, numMatrices), [&](const OrdinalType& l) { - SerialTrsm::template invoke(1, - Kokkos::subview( - H, l, - Kokkos::ALL, - Kokkos::ALL), - Kokkos::subview( - G, l, - Kokkos::ALL)); + // Apply the previous Givens rotations: + auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); + auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); + auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); + + if (mask(l) == 1.) 
{ + for (size_t i = 0; i < j; ++i) { + auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); + auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); + H_j(i) = tmp1; + H_j(i + 1) = tmp2; + } + + // Compute the new Givens rotation: + Kokkos::pair + G_new(1, 0); + typename VectorViewType::non_const_value_type alpha = 0; + SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); + + Givens_0_l(j) = G_new.first; + Givens_1_l(j) = G_new.second; + + // Apply the new Givens rotation: + auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); + auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); + H_j(j) = tmp1; + H_j(j + 1) = tmp2; + + G(l, j + 1) = -Givens_1_l(j) * G(l, j); + G(l, j) *= Givens_0_l(j); + } else { + H_j(j) = 1.; + G(l, j + 1) = 0.; + } + + auto res_norm = + Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + + handle.set_norm(member.league_rank(), l, j + 1, res_norm); + + if (mask(l) == 1. && res_norm < tolerance) { + mask(l) = 0.; + G(l, j + 1) = 0.; + handle.set_iteration(member.league_rank(), l, j + 1); + } }); + member.team_barrier(); + + bool all_converged = true; + for (OrdinalType l = 0; l < numMatrices; ++l) + all_converged = (all_converged && mask(l) == 0.); + if (all_converged) { + maximum_iteration = j + 1; + break; + } + } + + member.team_barrier(); // Finish writing to G + + auto first_indices = Kokkos::make_pair(0, (int)maximum_iteration); + + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& l) { + auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); + auto B_l = Kokkos::subview(G, l, first_indices); - member.team_barrier(); // Finish writing to G + SerialTrsm::invoke(1, A_l, B_l); + }); + member.team_barrier(); // Finish writing to G + + if (handle.get_ortho_strategy() == 0) { + TeamVectorGemv::invoke( + member, 1, + Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), + Kokkos::subview(G, Kokkos::ALL, 
first_indices), 1, X); + member.team_barrier(); // Finish writing to X + } + if (handle.get_ortho_strategy() == 1) { for (size_t j = 0; j < maximum_iteration; ++j) { TeamVectorAxpy::invoke( member, Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL), X); + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), X); member.team_barrier(); // Finish writing to X } - - TeamVectorCopy::invoke(member, X, _X); - return status; } - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandle& - handle) { - Identity P; - return invoke(member, A, _B, _X, P, - handle); + TeamVectorCopy::invoke(member, X, _X); + + member.team_barrier(); + + if (handle.get_compute_last_residual()) { + TeamVectorCopy::invoke(member, _B, W); + member.team_barrier(); + A.template apply(member, X, W, -1, 1); + member.team_barrier(); + P.template apply(member, W, W); + member.team_barrier(); + TeamVectorDot::invoke(member, W, W, tmp); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_last_norm(member.league_rank(), i, + tmp(i)); + }); } -}; + return status; +} + +template +template +KOKKOS_INLINE_FUNCTION int TeamVectorGMRES::invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle) { + Identity P; + return invoke(member, A, _B, _X, P, + handle); +} + } // namespace KokkosBatched #endif diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp index 4b4bd06bc0..41ac90e61d 100644 --- a/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp +++ b/src/batched/sparse/impl/KokkosBatched_GMRES_Team_Impl.hpp @@ -54,6 +54,7 @@ #include "KokkosBatched_Givens_Serial_Internal.hpp" 
#include "KokkosBatched_Trsm_Decl.hpp" #include "KokkosBatched_Identity.hpp" +#include "KokkosBatched_Gemv_Decl.hpp" namespace KokkosBatched { @@ -63,123 +64,157 @@ namespace KokkosBatched { /// template -struct TeamGMRES { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, const PrecOperatorType& P, - const KrylovHandle& - handle) { - typedef int OrdinalType; - typedef typename Kokkos::Details::ArithTraits< - typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; - typedef Kokkos::Details::ArithTraits ATM; - - using ScratchPadNormViewType = Kokkos::View< - MagnitudeType*, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type**, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using ScratchPadMultiVectorViewType = Kokkos::View< - typename VectorViewType::non_const_value_type***, - typename VectorViewType::array_layout, - typename VectorViewType::execution_space::scratch_memory_space>; - using TeamCopy1D = TeamCopy; - - const OrdinalType numMatrices = _X.extent(0); - const OrdinalType numRows = _X.extent(1); - - size_t maximum_iteration = handle.get_max_iteration() < numRows - ? 
handle.get_max_iteration() - : numRows; - const MagnitudeType tolerance = handle.get_tolerance(); - const MagnitudeType max_tolerance = 0.; - - ScratchPadMultiVectorViewType V(member.team_scratch(1), numMatrices, - maximum_iteration + 1, numRows); - ScratchPadMultiVectorViewType H(member.team_scratch(1), numMatrices, - maximum_iteration + 1, maximum_iteration); - ScratchPadMultiVectorViewType Givens(member.team_scratch(1), numMatrices, - maximum_iteration, 2); - ScratchPadVectorViewType G(member.team_scratch(1), numMatrices, - maximum_iteration + 1); - - ScratchPadVectorViewType W(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType Q(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType R(member.team_scratch(0), numMatrices, numRows); - ScratchPadVectorViewType X(member.team_scratch(0), numMatrices, numRows); - - ScratchPadNormViewType beta(member.team_scratch(0), numMatrices); - ScratchPadNormViewType mask(member.team_scratch(0), numMatrices); - ScratchPadNormViewType tmp(member.team_scratch(0), numMatrices); - - TeamCopy::invoke(member, _X, X); - // Deep copy of b into r_0: - TeamCopy::invoke(member, _B, R); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { mask(i) = 1.; }); - - // r_0 := b - A x_0 - member.team_barrier(); - A.template apply( - member, X, R, -1, 1); +template +KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const PrecOperatorType& P, + const KrylovHandleType& handle) { + typedef int OrdinalType; + typedef typename Kokkos::Details::ArithTraits< + typename VectorViewType::non_const_value_type>::mag_type MagnitudeType; + typedef Kokkos::Details::ArithTraits ATM; + + using ScratchPadVectorViewType = Kokkos::View< + typename VectorViewType::non_const_value_type**, + typename VectorViewType::array_layout, + typename 
VectorViewType::execution_space::scratch_memory_space>; + using TeamCopy1D = TeamCopy; + + const OrdinalType numMatrices = _X.extent(0); + const OrdinalType numRows = _X.extent(1); + + size_t maximum_iteration = handle.get_max_iteration() < numRows + ? handle.get_max_iteration() + : numRows; + const MagnitudeType tolerance = handle.get_tolerance(); + const MagnitudeType max_tolerance = handle.get_max_tolerance(); + + int n_V = numRows; + int n_H = maximum_iteration + 1; + int n_Givens = 2; + + int offset_V = 0; + int offset_H = offset_V + n_V; + int offset_Givens = offset_H + n_H; + + const int first_matrix = handle.first_index(member.league_rank()); + const int last_matrix = handle.last_index(member.league_rank()); + + auto V_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_V, offset_V + n_V)); + auto H_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_H, offset_H + n_H)); + auto Givens_view = Kokkos::subview( + handle.Arnoldi_view, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL, Kokkos::make_pair(offset_Givens, offset_Givens + n_Givens)); + + int n_G = maximum_iteration + 1; + int n_W = numRows; + int n_X = numRows; + int n_mask = 1; + int n_tmp = 1; + + int offset_G = 0; + int offset_W = offset_G + n_G; + int offset_X = offset_W + n_W; + int offset_mask = offset_X + n_X; + int offset_tmp = offset_mask + n_mask; + + ScratchPadVectorViewType tmp_2D( + member.team_scratch(handle.get_scratch_pad_level()), numMatrices, + n_G + n_W + n_X + n_mask + n_tmp); + + auto G = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_G, offset_G + n_G)); + auto W = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_W, offset_W + n_W)); + auto X = Kokkos::subview(tmp_2D, Kokkos::ALL, + Kokkos::make_pair(offset_X, offset_X + n_X)); + auto mask = Kokkos::subview(tmp_2D, Kokkos::ALL, 
offset_mask); + auto tmp = Kokkos::subview(tmp_2D, Kokkos::ALL, offset_tmp); + + TeamCopy::invoke(member, _X, X); + // Deep copy of b into r_0: + TeamCopy::invoke(member, _B, W); + + // r_0 := b - A x_0 + member.team_barrier(); + A.template apply(member, X, W, -1, 1); + member.team_barrier(); + + P.template apply(member, W, W); + member.team_barrier(); + + TeamDot::invoke(member, W, W, tmp); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_norm(member.league_rank(), i, 0, tmp(i)); + if (tmp(i) > max_tolerance) { + mask(i) = 1; + G(i, 0) = tmp(i); + tmp(i) = 1. / tmp(i); + } else { + handle.set_iteration(member.league_rank(), i, 0); + mask(i) = 0; + G(i, 0) = 0.; + tmp(i) = 0.; + } + }); + + member.team_barrier(); // Finish writing to tmp + + auto V_0 = Kokkos::subview(V_view, Kokkos::ALL, 0, Kokkos::ALL); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), + [&](const OrdinalType& iTemp) { + OrdinalType iRow, iMatrix; + getIndices( + iTemp, numRows, numMatrices, iRow, iMatrix); + V_0(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); + }); + int status = 1; + // int number_not_converged = 0; + + for (size_t j = 0; j < maximum_iteration; ++j) { + member.team_barrier(); // Finish writing to V + // q := A p_j + auto V_j = Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL); + + A.template apply(member, V_j, W); member.team_barrier(); - P.template apply(member, R, R); + P.template apply(member, W, W); member.team_barrier(); - TeamDot::invoke(member, R, R, beta); - member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - beta(i) = ATM::sqrt(beta(i)); - G(i, 0) = beta(i) > max_tolerance ? beta(i) : 0.; - tmp(i) = beta(i) > max_tolerance ? 1. 
/ beta(i) : 0.; - }); - - member.team_barrier(); // Finish writing to tmp - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), - [&](const OrdinalType& iTemp) { - OrdinalType iRow, iMatrix; - getIndices( - iTemp, numRows, numMatrices, iRow, iMatrix); - V(iMatrix, 0, iRow) = R(iMatrix, iRow) * tmp(iMatrix); - }); - - int status = 1; - // int number_not_converged = 0; - - for (size_t j = 0; j < maximum_iteration; ++j) { - member.team_barrier(); // Finish writing to V - // q := A p_j - auto V_j = Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL); - - A.template apply(member, V_j, W); + if (handle.get_ortho_strategy() == 0) { + auto V_old = Kokkos::subview( + V_view, Kokkos::ALL, Kokkos::make_pair(0, (int)j + 1), Kokkos::ALL); + auto H_old = Kokkos::subview(H_view, Kokkos::ALL, j, + Kokkos::make_pair(0, (int)j + 1)); + // Inner products + TeamGemv::invoke( + member, 1, V_old, W, 0, H_old); member.team_barrier(); - P.template apply(member, W, W); + // Update + TeamGemv::invoke( + member, -1, V_old, H_old, 1, W); + member.team_barrier(); // Finish writing to W + } + if (handle.get_ortho_strategy() == 1) { for (size_t i = 0; i < j + 1; ++i) { - member.team_barrier(); // Finish writing to W - auto V_i = Kokkos::subview(V, Kokkos::ALL, i, Kokkos::ALL); + auto V_i = Kokkos::subview(V_view, Kokkos::ALL, i, Kokkos::ALL); TeamDot::invoke(member, W, V_i, tmp); member.team_barrier(); - TeamCopy1D::invoke(member, tmp, Kokkos::subview(H, Kokkos::ALL, i, j)); - member.team_barrier(); // Don't start modifying tmp until copy above - // finishes + TeamCopy1D::invoke(member, tmp, + Kokkos::subview(H_view, Kokkos::ALL, j, i)); + member.team_barrier(); Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); }); @@ -187,117 +222,161 @@ struct TeamGMRES { member.team_barrier(); // Finish writing to tmp TeamAxpy::invoke(member, tmp, V_i, W); + member.team_barrier(); // Finish writing to W } + } - 
member.team_barrier(); // Finish writing to W - TeamDot::invoke(member, W, W, tmp); - member.team_barrier(); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& i) { - H(i, j + 1, j) = ATM::sqrt(tmp(i)); - tmp(i) = H(i, j + 1, j) > max_tolerance ? 1. / H(i, j + 1, j) : 0.; - }); - member.team_barrier(); + TeamDot::invoke(member, W, W, tmp); + member.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + H_view(i, j, j + 1) = ATM::sqrt(tmp(i)); + tmp(i) = H_view(i, j, j + 1) > max_tolerance + ? 1. / H_view(i, j, j + 1) + : 0.; + }); + member.team_barrier(); + if (j + 1 < maximum_iteration) { + auto V_n = Kokkos::subview(V_view, Kokkos::ALL, j + 1, Kokkos::ALL); Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, numMatrices * numRows), [&](const OrdinalType& iTemp) { OrdinalType iRow, iMatrix; getIndices( iTemp, numRows, numMatrices, iRow, iMatrix); - V(iMatrix, j + 1, iRow) = W(iMatrix, iRow) * tmp(iMatrix); - }); - - Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, numMatrices), - [&](const OrdinalType& l) { - // Apply the previous Givens rotations: - auto H_j = Kokkos::subview(H, l, Kokkos::ALL, j); - - if (mask(l) == 1.) 
{ - for (size_t i = 0; i < j; ++i) { - auto tmp1 = - Givens(l, i, 0) * H_j(i) + Givens(l, i, 1) * H_j(i + 1); - auto tmp2 = - -Givens(l, i, 1) * H_j(i) + Givens(l, i, 0) * H_j(i + 1); - H_j(i) = tmp1; - H_j(i + 1) = tmp2; - } - - // Compute the new Givens rotation: - Kokkos::pair - G_new(1, 0); - typename VectorViewType::non_const_value_type alpha = 0; - SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); - - Givens(l, j, 0) = G_new.first; - Givens(l, j, 1) = G_new.second; - - // Apply the new Givens rotation: - auto tmp1 = - Givens(l, j, 0) * H_j(j) + Givens(l, j, 1) * H_j(j + 1); - auto tmp2 = - -Givens(l, j, 1) * H_j(j) + Givens(l, j, 0) * H_j(j + 1); - H_j(j) = tmp1; - H_j(j + 1) = tmp2; - - G(l, j + 1) = -Givens(l, j, 1) * G(l, j); - G(l, j) *= Givens(l, j, 0); - } else { - H_j(j) = 1.; - G(l, j + 1) = 0.; - } - - if (mask(l) == 1. && - Kokkos::ArithTraits::abs(G(l, j + 1)) / beta(l) < - tolerance) { - mask(l) = 0.; - G(l, j + 1) = 0.; - } + V_n(iMatrix, iRow) = W(iMatrix, iRow) * tmp(iMatrix); }); + member.team_barrier(); } - member.team_barrier(); // Finish writing to G - Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, numMatrices), [&](const OrdinalType& l) { - SerialTrsm::template invoke(1, - Kokkos::subview( - H, l, - Kokkos::ALL, - Kokkos::ALL), - Kokkos::subview( - G, l, - Kokkos::ALL)); + // Apply the previous Givens rotations: + auto H_j = Kokkos::subview(H_view, l, j, Kokkos::ALL); + auto Givens_0_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 0); + auto Givens_1_l = Kokkos::subview(Givens_view, l, Kokkos::ALL, 1); + + if (mask(l) == 1.) 
{ + for (size_t i = 0; i < j; ++i) { + auto tmp1 = Givens_0_l(i) * H_j(i) + Givens_1_l(i) * H_j(i + 1); + auto tmp2 = -Givens_1_l(i) * H_j(i) + Givens_0_l(i) * H_j(i + 1); + H_j(i) = tmp1; + H_j(i + 1) = tmp2; + } + + // Compute the new Givens rotation: + Kokkos::pair + G_new(1, 0); + typename VectorViewType::non_const_value_type alpha = 0; + SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha); + + Givens_0_l(j) = G_new.first; + Givens_1_l(j) = G_new.second; + + // Apply the new Givens rotation: + auto tmp1 = Givens_0_l(j) * H_j(j) + Givens_1_l(j) * H_j(j + 1); + auto tmp2 = -Givens_1_l(j) * H_j(j) + Givens_0_l(j) * H_j(j + 1); + H_j(j) = tmp1; + H_j(j + 1) = tmp2; + + G(l, j + 1) = -Givens_1_l(j) * G(l, j); + G(l, j) *= Givens_0_l(j); + } else { + H_j(j) = 1.; + G(l, j + 1) = 0.; + } + + auto res_norm = + Kokkos::ArithTraits::abs(G(l, j + 1)) / G(l, 0); + + handle.set_norm(member.league_rank(), l, j + 1, res_norm); + + if (mask(l) == 1. && res_norm < tolerance) { + mask(l) = 0.; + G(l, j + 1) = 0.; + handle.set_iteration(member.league_rank(), l, j + 1); + } }); + member.team_barrier(); + + bool all_converged = true; + for (OrdinalType l = 0; l < numMatrices; ++l) + all_converged = (all_converged && mask(l) == 0.); + if (all_converged) { + maximum_iteration = j + 1; + break; + } + } + + member.team_barrier(); // Finish writing to G + + auto first_indices = Kokkos::make_pair(0, (int)maximum_iteration); + + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, numMatrices), + [&](const OrdinalType& l) { + auto A_l = Kokkos::subview(H_view, l, first_indices, first_indices); + auto B_l = Kokkos::subview(G, l, first_indices); - member.team_barrier(); // Finish writing to G + SerialTrsm::invoke(1, A_l, B_l); + }); + member.team_barrier(); // Finish writing to G + + if (handle.get_ortho_strategy() == 0) { + TeamGemv::invoke( + member, 1, + Kokkos::subview(V_view, Kokkos::ALL, first_indices, Kokkos::ALL), + Kokkos::subview(G, Kokkos::ALL, first_indices), 
1, X); + member.team_barrier(); // Finish writing to X + } + if (handle.get_ortho_strategy() == 1) { for (size_t j = 0; j < maximum_iteration; ++j) { TeamAxpy::invoke( member, Kokkos::subview(G, Kokkos::ALL, j), - Kokkos::subview(V, Kokkos::ALL, j, Kokkos::ALL), X); + Kokkos::subview(V_view, Kokkos::ALL, j, Kokkos::ALL), X); member.team_barrier(); // Finish writing to X } - - TeamCopy::invoke(member, X, _X); - return status; } - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType& member, const OperatorType& A, const VectorViewType& _B, - const VectorViewType& _X, - const KrylovHandle& - handle) { - Identity P; - return invoke(member, A, _B, _X, P, - handle); + TeamCopy::invoke(member, X, _X); + + member.team_barrier(); + + if (handle.get_compute_last_residual()) { + TeamCopy::invoke(member, _B, W); + member.team_barrier(); + A.template apply(member, X, W, -1, 1); + member.team_barrier(); + P.template apply(member, W, W); + member.team_barrier(); + TeamDot::invoke(member, W, W, tmp); + member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices), + [&](const OrdinalType& i) { + tmp(i) = ATM::sqrt(tmp(i)); + handle.set_last_norm(member.league_rank(), i, + tmp(i)); + }); } -}; + return status; +} + +template +template +KOKKOS_INLINE_FUNCTION int TeamGMRES::invoke( + const MemberType& member, const OperatorType& A, const VectorViewType& _B, + const VectorViewType& _X, const KrylovHandleType& handle) { + Identity P; + return invoke(member, A, _B, _X, P, + handle); +} + } // namespace KokkosBatched #endif diff --git a/src/blas/KokkosBlas1_axpby.hpp b/src/blas/KokkosBlas1_axpby.hpp index cae0cc7102..e8b79df565 100644 --- a/src/blas/KokkosBlas1_axpby.hpp +++ b/src/blas/KokkosBlas1_axpby.hpp @@ -46,6 +46,7 @@ #define KOKKOSBLAS1_AXPBY_HPP_ #include +#include #include #include @@ -124,6 +125,32 @@ void axpy(const AV& a, const XMV& X, const YMV& Y) { Y); } +/// +/// Serial axpy on device +/// +template +KOKKOS_FUNCTION 
void serial_axpy(const scalar_type alpha, const XMV X, YMV Y) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBlas::serial_axpy: XMV is not a Kokkos::View"); + static_assert(Kokkos::is_view::value, + "KokkosBlas::serial_axpy: YMV is not a Kokkos::View"); + static_assert(XMV::Rank == 1 || XMV::Rank == 2, + "KokkosBlas::serial_axpy: XMV must have rank 1 or 2."); + static_assert( + XMV::Rank == YMV::Rank, + "KokkosBlas::serial_axpy: XMV and YMV must have the same rank."); + + if (X.extent(0) != Y.extent(0) || X.extent(1) != Y.extent(1)) { + Kokkos::abort("KokkosBlas::serial_axpy: X and Y dimensions do not match"); + } +#endif // KOKKOSKERNELS_DEBUG_LEVEL + + return Impl::serial_axpy_mv(X.extent(0), X.extent(1), alpha, X.data(), + Y.data(), X.stride_0(), X.stride_1(), + Y.stride_0(), Y.stride_1()); +} + } // namespace KokkosBlas #endif diff --git a/src/blas/KokkosBlas1_nrm2.hpp b/src/blas/KokkosBlas1_nrm2.hpp index 3a10e48a4d..bbe231e795 100644 --- a/src/blas/KokkosBlas1_nrm2.hpp +++ b/src/blas/KokkosBlas1_nrm2.hpp @@ -46,6 +46,7 @@ #define KOKKOSBLAS1_NRM2_HPP_ #include +#include #include #include @@ -156,6 +157,63 @@ void nrm2(const RV& R, const XMV& X, Impl::Nrm2::nrm2(R_internal, X_internal, true); } + +/// +/// Serial nrm2 +/// +template +KOKKOS_INLINE_FUNCTION typename Kokkos::Details::InnerProductSpaceTraits< + typename XMV::non_const_value_type>::mag_type +serial_nrm2(const XMV X) { +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + "KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); + static_assert(XMV::Rank == 1, + "KokkosBlas::serial_nrm2: XMV must have rank 1"); +#endif // KOKKOSKERNELS_DEBUG_LEVEL + + return Impl::serial_nrm2(X.extent(0), X.data(), X.stride_0()); +} + +template +KOKKOS_INLINE_FUNCTION int serial_nrm2(const XMV X, const RV& R) { +// Do some compile time check when debug is enabled +#if (KOKKOSKERNELS_DEBUG_LEVEL > 0) + static_assert(Kokkos::is_view::value, + 
"KokkosBlas::serial_nrm2: XMV is not a Kokkos::View"); + static_assert(Kokkos::is_view::value, + "KokkosBlas::serial_nrm2: RV is not a Kokkos::View"); + static_assert(std::is_same::value, + "KokkosBlas::serial_nrm2: R is const. " + "It must be nonconst, because it is an output argument " + "(we have to be able to write to its entries)."); + static_assert(((RV::rank == 0) && (XMV::rank == 1)) || + ((RV::rank == 1) && (XMV::rank == 2)), + "KokkosBlas::serial_nrm2: " + "RV and XMV must either have rank 0 and 1 or rank 1 and 2."); + + using norm_type = typename Kokkos::Details::InnerProductSpaceTraits< + typename XMV::non_const_value_type>::mag_type; + static_assert( + std::is_same::value, + "KokkosBlas::serial_nrm2: RV must have same value_type as" + " Kokkos::ArithTraits::mag_type"); + + if (R.extent(0) != X.extent(1)) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "KokkosBlas::serial_nrm2 (MV): Dimensions of R and X do not match," + " R: %d and X: %d x %d.\n", + R.extent_int(0), X.extent_int(0), X.extent_int(1)); + return 1; + } +#endif // KOKKOSKERNELS_DEBUG_LEVEL + + Impl::serial_nrm2(X.extent(0), X.extent(1), X.data(), X.stride_0(), + X.stride_1(), R.data(), R.stride_0()); + return 0; +} + } // namespace KokkosBlas #endif // KOKKOSBLAS1_NRM2_HPP_ diff --git a/src/blas/KokkosBlas1_scal.hpp b/src/blas/KokkosBlas1_scal.hpp index 2fc4f92f58..d533efe535 100644 --- a/src/blas/KokkosBlas1_scal.hpp +++ b/src/blas/KokkosBlas1_scal.hpp @@ -46,9 +46,15 @@ #define KOKKOSBLAS1_SCAL_HPP_ #include +#include +#include #include #include +/// +/// General/Host Scale +/// + namespace KokkosBlas { template @@ -108,6 +114,51 @@ void scal(const RMV& R, const AV& a, const XMV& X) { R_internal, a_internal, X_internal); } +/// +/// Serial Scale +/// + +struct SerialScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, + const AViewType& A) { + return Impl::SerialScaleInternal::invoke( + A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); + } +}; 
+ +/// +/// Team Scale +/// + +template +struct TeamScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const ScalarType alpha, + const AViewType& A) { + return Impl::TeamScaleInternal::invoke(member, A.extent(0), A.extent(1), + alpha, A.data(), A.stride_0(), + A.stride_1()); + } +}; + +/// +/// TeamVector Scale +/// + +template +struct TeamVectorScale { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& member, + const ScalarType alpha, + const AViewType& A) { + return Impl::TeamVectorScaleInternal::invoke(member, A.extent(0), + A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_1()); + } +}; + } // namespace KokkosBlas #endif diff --git a/src/blas/KokkosBlas1_set.hpp b/src/blas/KokkosBlas1_set.hpp new file mode 100644 index 0000000000..61c03ec17a --- /dev/null +++ b/src/blas/KokkosBlas1_set.hpp @@ -0,0 +1,99 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSBLAS1_SET_HPP_ +#define KOKKOSBLAS1_SET_HPP_ + +#include + +namespace KokkosBlas { + +/// +/// Serial Set +/// + +struct SerialSet { + template + KOKKOS_INLINE_FUNCTION static int invoke(const ScalarType alpha, + const AViewType &A) { + return Impl::SerialSetInternal::invoke( + A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1()); + } +}; + +/// +/// Team Set +/// + +template +struct TeamSet { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A) { + return Impl::TeamSetInternal::invoke(member, A.extent(0), A.extent(1), + alpha, A.data(), A.stride_0(), + A.stride_1()); + } +}; + +/// +/// TeamVector Set +/// + +template +struct TeamVectorSet { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType &member, + const ScalarType alpha, + const AViewType &A) { + return Impl::TeamVectorSetInternal::invoke(member, A.extent(0), A.extent(1), + alpha, A.data(), A.stride_0(), + A.stride_1()); + } +}; + +} // namespace KokkosBlas + +#endif diff --git 
a/src/blas/KokkosBlas_trtri.hpp b/src/blas/KokkosBlas_trtri.hpp index 0402b11104..afcc05d5ae 100644 --- a/src/blas/KokkosBlas_trtri.hpp +++ b/src/blas/KokkosBlas_trtri.hpp @@ -129,7 +129,7 @@ int trtri(const char uplo[], const char diag[], const AViewType& A) { // This is the return value type and should always reside on host using RViewInternalType = - Kokkos::View >; int result; diff --git a/src/blas/impl/KokkosBlas1_dot_impl.hpp b/src/blas/impl/KokkosBlas1_dot_impl.hpp index cb8db757f8..5430e0177b 100644 --- a/src/blas/impl/KokkosBlas1_dot_impl.hpp +++ b/src/blas/impl/KokkosBlas1_dot_impl.hpp @@ -83,7 +83,7 @@ struct DotFunctor { Kokkos::Details::updateDot(sum, m_x(i), m_y(i)); // sum += m_x(i) * m_y(i) } - KOKKOS_INLINE_FUNCTION void init(volatile value_type& update) const { + KOKKOS_INLINE_FUNCTION void init(value_type& update) const { update = Kokkos::Details::ArithTraits::zero(); } @@ -91,11 +91,6 @@ struct DotFunctor { const value_type& source) const { update += source; } - - KOKKOS_INLINE_FUNCTION void join(volatile value_type& update, - const volatile value_type& source) const { - update += source; - } }; } // namespace Impl diff --git a/src/blas/impl/KokkosBlas1_iamax_impl.hpp b/src/blas/impl/KokkosBlas1_iamax_impl.hpp index dc30edf7da..8b27b3e5a3 100644 --- a/src/blas/impl/KokkosBlas1_iamax_impl.hpp +++ b/src/blas/impl/KokkosBlas1_iamax_impl.hpp @@ -96,13 +96,6 @@ struct V_Iamax_Functor { update = Kokkos::reduction_identity::max() + 1; } - KOKKOS_INLINE_FUNCTION void join(volatile value_type& update, - const volatile value_type& source) const { - mag_type source_val = IPT::norm(m_x(source - 1)); - mag_type update_val = IPT::norm(m_x(update - 1)); - if (update_val < source_val) update = source; - } - KOKKOS_INLINE_FUNCTION void join(value_type& update, const value_type& source) const { mag_type source_val = IPT::norm(m_x(source - 1)); diff --git a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp index 
f2b0e826bc..e56a884655 100644 --- a/src/blas/impl/KokkosBlas1_nrm2_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2_impl.hpp @@ -105,11 +105,6 @@ struct V_Nrm2_Functor { update += source; } - KOKKOS_INLINE_FUNCTION void join(volatile value_type& update, - const volatile value_type& source) const { - update += source; - } - KOKKOS_INLINE_FUNCTION void final(value_type& update) const { if (m_take_sqrt) update = diff --git a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp index 3f202ca430..e2c858f0b3 100644 --- a/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp +++ b/src/blas/impl/KokkosBlas1_nrm2w_impl.hpp @@ -108,11 +108,6 @@ struct V_Nrm2w_Functor { update += source; } - KOKKOS_INLINE_FUNCTION void join(volatile value_type& update, - const volatile value_type& source) const { - update += source; - } - KOKKOS_INLINE_FUNCTION void final(value_type& update) const { if (m_take_sqrt) update = diff --git a/src/blas/impl/KokkosBlas1_serial_scal_impl.hpp b/src/blas/impl/KokkosBlas1_serial_scal_impl.hpp new file mode 100644 index 0000000000..bb411ef4a5 --- /dev/null +++ b/src/blas/impl/KokkosBlas1_serial_scal_impl.hpp @@ -0,0 +1,86 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSBLAS1_SERIAL_SCAL_IMPL_HPP_ +#define KOKKOSBLAS1_SERIAL_SCAL_IMPL_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +/// +/// Serial Internal Impl +/// ==================== +struct SerialScaleInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, + const int as0) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = 0; i < m; ++i) A[i * as0] *= alpha; + + return 0; + } + + template + KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, + const ScalarType alpha, + /* */ ValueType *KOKKOS_RESTRICT A, + const int as0, const int as1) { + if (as0 > as1) + for (int i = 0; i < m; ++i) invoke(n, alpha, A + i * as0, as1); + else + for (int j = 0; j < n; ++j) invoke(m, alpha, A + j * as1, as0); + + return 0; + } +}; + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/src/batched/dense/impl/KokkosBatched_Set_Internal.hpp b/src/blas/impl/KokkosBlas1_set_impl.hpp similarity index 66% rename from src/batched/dense/impl/KokkosBatched_Set_Internal.hpp rename to src/blas/impl/KokkosBlas1_set_impl.hpp index f18ac4355c..a3870a2e15 100644 --- a/src/batched/dense/impl/KokkosBatched_Set_Internal.hpp +++ b/src/blas/impl/KokkosBlas1_set_impl.hpp @@ -1,11 +1,56 @@ -#ifndef __KOKKOSBATCHED_SET_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SET_INTERNAL_HPP__ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef __KOKKOSBLAS_SET_IMPL_HPP__ +#define __KOKKOSBLAS_SET_IMPL_HPP__ /// \author Kyungjoo Kim (kyukim@sandia.gov) -#include "KokkosBatched_Util.hpp" +#include "Kokkos_Core.hpp" -namespace KokkosBatched { +namespace KokkosBlas { +namespace Impl { /// /// Serial Internal Impl @@ -115,6 +160,7 @@ struct TeamVectorSetInternal { } }; -} // end namespace KokkosBatched +} // namespace Impl +} // namespace KokkosBlas #endif diff --git a/src/batched/dense/impl/KokkosBatched_Scale_Internal.hpp b/src/blas/impl/KokkosBlas1_team_scal_impl.hpp similarity index 59% rename from src/batched/dense/impl/KokkosBatched_Scale_Internal.hpp rename to src/blas/impl/KokkosBlas1_team_scal_impl.hpp index 6f313ea919..6f4fdf40b0 100644 --- a/src/batched/dense/impl/KokkosBatched_Scale_Internal.hpp +++ b/src/blas/impl/KokkosBlas1_team_scal_impl.hpp @@ -1,41 +1,55 @@ -#ifndef __KOKKOSBATCHED_SCALE_INTERNAL_HPP__ -#define __KOKKOSBATCHED_SCALE_INTERNAL_HPP__ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ -/// \author Kyungjoo Kim (kyukim@sandia.gov) +#ifndef KOKKOSBLAS1_TEAM_SCAL_IMPL_HPP_ +#define KOKKOSBLAS1_TEAM_SCAL_IMPL_HPP_ -#include "KokkosBatched_Util.hpp" +#include +#include "KokkosBlas1_serial_scal_impl.hpp" -namespace KokkosBatched { - -/// -/// Serial Internal Impl -/// ==================== -struct SerialScaleInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0) { -#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) -#pragma unroll -#endif - for (int i = 0; i < m; ++i) A[i * as0] *= alpha; - - return 0; - } - - template - KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n, - const ScalarType alpha, - /* */ ValueType *KOKKOS_RESTRICT A, - const int as0, const int as1) { - if (as0 > as1) - for (int i = 0; i < m; ++i) invoke(n, alpha, A + i * as0, as1); - else 
- for (int j = 0; j < n; ++j) invoke(m, alpha, A + j * as1, as0); - - return 0; - } -}; +namespace KokkosBlas { +namespace Impl { /// /// Team Internal Impl @@ -115,6 +129,7 @@ struct TeamVectorScaleInternal { } }; -} // namespace KokkosBatched +} // namespace Impl +} // namespace KokkosBlas #endif diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp index 6f27363be9..a6c8111684 100644 --- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -64,8 +64,9 @@ struct SingleLevelNontransposeGEMV { using BetaCoeffType = typename YViewType::non_const_value_type; using y_value_type = typename YViewType::non_const_value_type; using AccumScalar = typename std::conditional< - std::is_same::value, float, - y_value_type>::type; + std::is_same::value || + std::is_same::value, + float, y_value_type>::type; SingleLevelNontransposeGEMV(const AlphaCoeffType& alpha, const AViewType& A, const XViewType& x, const BetaCoeffType& beta, @@ -146,8 +147,9 @@ struct SingleLevelTransposeGEMV { using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; using AccumScalar = typename std::conditional< - std::is_same::value, float, - y_value_type>::type; + std::is_same::value || + std::is_same::value, + float, y_value_type>::type; typedef AccumScalar value_type[]; IndexType value_count; // Kokkos needs this for reductions w/ array results @@ -188,8 +190,7 @@ struct SingleLevelTransposeGEMV { } } - KOKKOS_INLINE_FUNCTION void join(volatile value_type dst, - const volatile value_type src) const { + KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const { for (IndexType j = 0; j < value_count; ++j) { dst[j] += src[j]; } @@ -479,8 +480,9 @@ struct TwoLevelGEMV { using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; using AccumScalar = typename 
std::conditional< - std::is_same::value, float, - y_value_type>::type; + std::is_same::value || + std::is_same::value, + float, y_value_type>::type; using execution_space = typename AViewType::execution_space; using policy_type = Kokkos::TeamPolicy; @@ -600,8 +602,9 @@ struct TwoLevelTransposeGEMV { using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; using AccumScalar = typename std::conditional< - std::is_same::value, float, - y_value_type>::type; + std::is_same::value || + std::is_same::value, + float, y_value_type>::type; using execution_space = typename AViewType::execution_space; using policy_type = Kokkos::TeamPolicy; @@ -739,7 +742,8 @@ void twoLevelGemv(const typename AViewType::execution_space& space, tagged_policy team; if (isLayoutLeft) { using AccumScalar = typename std::conditional< - std::is_same::value, + std::is_same::value || + std::is_same::value, float, y_value_type>::type; size_t sharedPerTeam = 32 * sizeof(AccumScalar); IndexType numTeams = (A.extent(0) + 31) / 32; diff --git a/src/blas/impl/KokkosBlas3_trmm_impl.hpp b/src/blas/impl/KokkosBlas3_trmm_impl.hpp index 56bc2ba806..2ba3363264 100644 --- a/src/blas/impl/KokkosBlas3_trmm_impl.hpp +++ b/src/blas/impl/KokkosBlas3_trmm_impl.hpp @@ -53,8 +53,6 @@ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" #include "Kokkos_ArithTraits.hpp" -#include "KokkosBatched_Set_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" #include "KokkosBatched_Trmm_Decl.hpp" #include "KokkosBatched_Trmm_Serial_Impl.hpp" diff --git a/src/blas/impl/KokkosBlas3_trsm_impl.hpp b/src/blas/impl/KokkosBlas3_trsm_impl.hpp index b215633093..d85b850138 100644 --- a/src/blas/impl/KokkosBlas3_trsm_impl.hpp +++ b/src/blas/impl/KokkosBlas3_trsm_impl.hpp @@ -54,6 +54,7 @@ #include "KokkosKernels_config.h" #include "Kokkos_Core.hpp" #include "Kokkos_ArithTraits.hpp" +#include "KokkosBlas1_set_impl.hpp" #include "KokkosBatched_Trsm_Decl.hpp" 
#include "KokkosBatched_Trsm_Serial_Impl.hpp" @@ -72,10 +73,10 @@ int SerialTrsmInternalLeftLowerConj(const bool use_unit_diag, const int m, const ScalarType one(1.0), zero(0.0); if (alpha == zero) - KokkosBatched::SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); + SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { if (alpha != one) - KokkosBatched::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; for (int p = 0; p < m; ++p) { @@ -111,10 +112,10 @@ int SerialTrsmInternalLeftUpperConj(const bool use_unit_diag, const int m, const ScalarType one(1.0), zero(0.0); if (alpha == zero) - KokkosBatched::SerialSetInternal ::invoke(m, n, zero, B, bs0, bs1); + SerialSetInternal::invoke(m, n, zero, B, bs0, bs1); else { if (alpha != one) - KokkosBatched::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); + KokkosBlas::Impl::SerialScaleInternal::invoke(m, n, alpha, B, bs0, bs1); if (m <= 0 || n <= 0) return 0; ValueType* KOKKOS_RESTRICT B0 = B; diff --git a/src/blas/impl/KokkosBlas_Newton_impl.hpp b/src/blas/impl/KokkosBlas_Newton_impl.hpp new file mode 100644 index 0000000000..a8a8973d41 --- /dev/null +++ b/src/blas/impl/KokkosBlas_Newton_impl.hpp @@ -0,0 +1,240 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Luc Berger-Vergiat (lberge@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef __KOKKOSBATCHED_ODE_NEWTON_HPP__ +#define __KOKKOSBATCHED_ODE_NEWTON_HPP__ + +#include "Kokkos_Core.hpp" +#include "KokkosBatched_LU_Decl.hpp" +#include "KokkosBatched_LU_Serial_Impl.hpp" +#include "KokkosBatched_Gesv.hpp" +#include "KokkosBlas1_nrm2.hpp" +#include "KokkosBlas1_scal.hpp" +#include "KokkosBlas1_axpby.hpp" + +namespace KokkosBlas { +namespace Impl { + +enum class NewtonSolverStatus { Converged = 0, LinearSolveFailure, MaxIters }; + +std::ostream& operator<<(std::ostream& os, NewtonSolverStatus& status) { + switch (status) { + case NewtonSolverStatus::Converged: os << "Newton Solver Converged!"; break; + case NewtonSolverStatus::LinearSolveFailure: + os << "Newton: Linear Solver Failure"; + break; + case NewtonSolverStatus::MaxIters: + os << "Newton reached maximum iterations without convergence."; + break; + } + return os; +} + +/// \brief NewtonHandle +/// +/// This handle is used to pass information between the Newton Solver and +/// the calling code. +/// +/// \tparam: NormViewType: Type of view used to store the residual convergence +/// history + +template +struct NewtonHandle { + using norm_type = typename NormViewType::non_const_value_type; + + NormViewType lastResidual; // Residual of last successful iteration + typename NormViewType::HostMirror lastResidualHost; + + // NormViewType residual_norms; + // TODO: Making these public for now. Should make private and access + // via setters and getters? + int maxIters; // Maximum number of Newton steps + norm_type relativeTol; // Relative convergence tolerance + bool debug_mode; // Returns extra verbose output if true. 
+ + NewtonHandle(int _maxIters = 25, double _relativeTol = 1.0e-6, + bool _debug = false) + : lastResidual("ending Residual norm", 1), + lastResidualHost("end res norm host", 1), + maxIters(_maxIters), + relativeTol(_relativeTol), + debug_mode(_debug) {} + + KOKKOS_FUNCTION + void set_residual(const norm_type val) const { lastResidual(0) = val; } + + KOKKOS_FUNCTION + norm_type get_residual() const { return lastResidual(0); } + + norm_type get_residual_host() const { + Kokkos::deep_copy(lastResidualHost, lastResidual); + return lastResidualHost(0); + } + +}; // NewtonHandle + +/// \brief Newton Functor: +/// Solves the nonlinear system F(x) = 0 +/// where F is a map from R^n to R^n. +/// \tparam System: Struct that allows the evaluation +/// of the residual and jacobian using the +/// residual() and jacobian() methods. +/// \tparam Matrix: rank-2 view-type +/// \tparam XVector: rank-1 view-type +/// \tparam YVector: rank-1 view-type +/// \param +/// \param X [in]: Input vector X, a rank 1 view +/// \param Y [in/out]: Output vector Y, a rank 1 view +/// +/// No nested parallel_for is used inside of the function. 
+/// +template +struct NewtonFunctor { + using execution_space = typename YVector::execution_space; + using yvalue_type = typename YVector::non_const_value_type; + using norm_type = typename NewtonHandleType::norm_type; + + System sys; + XVector x; + YVector rhs; + NewtonHandleType handle; + + Matrix J, tmp; + XVector update; + + NewtonFunctor(System _sys, XVector _x, YVector _rhs, + NewtonHandleType& _handle) + : sys(_sys), x(_x), rhs(_rhs), handle(_handle) { + J = Matrix("Jacobian", x.extent(0), x.extent(0)); + tmp = Matrix("Jacobian", x.extent(0), x.extent(0) + 4); + update = XVector("update", x.extent(0)); + } + + KOKKOS_INLINE_FUNCTION + NewtonSolverStatus solve() const { + norm_type norm = Kokkos::ArithTraits::zero(); + yvalue_type alpha = Kokkos::ArithTraits::one(); + handle.set_residual(-1); // init to dummy value + + // Iterate until maxIts or the tolerance is reached + for (int it = 0; it < handle.maxIters; ++it) { + // compute initial rhs + sys.residual(x, rhs); + if (handle.debug_mode) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("NewtonFunctor: r="); + for (int k = 0; k < rhs.extent_int(0); k++) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", rhs(k)); + } + } + + // Solve the following linearized + // problem at each step: J*update=-rhs + // with J=du/dx, rhs=f(u_n+update)-f(u_n) + norm = KokkosBlas::serial_nrm2(rhs); + handle.set_residual(norm); + + if (handle.debug_mode) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "NewtonFunctor: Iteration: %d Current res norm is: %e \n Current " + "soln is:\n", + it, (double)handle.get_residual()); + for (int k = 0; k < x.extent_int(0); k++) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k)); + } + } + + if (norm < handle.relativeTol) { + // Problem solved, exit the functor + if (handle.debug_mode) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "NewtonFunctor: Newton solver converged! 
Ending norm is: %e \n " + "Solution x is: " + "\n", + norm); + for (int k = 0; k < x.extent_int(0); k++) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k)); + } + } + return NewtonSolverStatus::Converged; + } + + // compute LHS + sys.jacobian(x, J); + + // solve linear problem + int linSolverStat = KokkosBatched::SerialGesv< + KokkosBatched::Gesv::StaticPivoting>::invoke(J, update, rhs, tmp); + KokkosBlas::SerialScale::invoke(-1, update); + + if (handle.debug_mode) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "NewtonFunctor: Print linear solve solution: \n"); + for (int k = 0; k < update.extent_int(0); k++) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", update(k)); + } + } + if (linSolverStat == 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "NewtonFunctor: Linear solve gesv returned failure! \n"); + return NewtonSolverStatus::LinearSolveFailure; + } + + // update solution // x = x + alpha*update + KokkosBlas::serial_axpy(alpha, update, x); + if (handle.debug_mode) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "NewtonFunctor: Print updated solution: \n"); + for (int k = 0; k < x.extent_int(0); k++) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f \n", x(k)); + } + } + } + return NewtonSolverStatus::MaxIters; + } // End solve functor. +}; + +} // namespace Impl +} // namespace KokkosBlas +#endif // __KOKKOSBATCHED_ODE_NEWTON_HPP__ diff --git a/src/blas/impl/KokkosBlas_serial_axpy.hpp b/src/blas/impl/KokkosBlas_serial_axpy.hpp new file mode 100644 index 0000000000..f9cc918650 --- /dev/null +++ b/src/blas/impl/KokkosBlas_serial_axpy.hpp @@ -0,0 +1,88 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Luc Berger-Vergiat (lberge@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSBLAS_SERIAL_AXPY_IMPL_HPP_ +#define KOKKOSBLAS_SERIAL_AXPY_IMPL_HPP_ + +#include + +namespace KokkosBlas { +namespace Impl { + +/// +/// Serial Internal Impl +/// ==================== +template +KOKKOS_INLINE_FUNCTION static void serial_axpy( + const int m, const ScalarType alpha, const ValueType *KOKKOS_RESTRICT X, + /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int ys0) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = 0; i < m; ++i) Y[i * ys0] += alpha * X[i * xs0]; + + return; +} + +template +KOKKOS_INLINE_FUNCTION static void serial_axpy_mv( + const int m, const int n, const ScalarType alpha, + const ValueType *KOKKOS_RESTRICT X, + /* */ ValueType *KOKKOS_RESTRICT Y, const int xs0, const int xs1, + const int ys0, const int ys1) { + if (xs0 > xs1) { + for (int i = 0; i < m; ++i) + serial_axpy(n, alpha, X + i * xs0, Y + i * ys0, xs1, ys1); + } else { + for (int j = 0; j < n; ++j) + serial_axpy(m, alpha, X + j * xs1, Y + j * ys1, xs0, ys0); + } + + return; +} + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/src/blas/impl/KokkosBlas_serial_nrm2.hpp b/src/blas/impl/KokkosBlas_serial_nrm2.hpp new file mode 100644 index 0000000000..9397dc5020 --- /dev/null +++ b/src/blas/impl/KokkosBlas_serial_nrm2.hpp @@ -0,0 +1,92 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSBLAS_SERIAL_NRM2_HPP_ +#define KOKKOSBLAS_SERIAL_NRM2_HPP_ + +#include +#include + +namespace KokkosBlas { +namespace Impl { + +/// +/// Serial Internal Impl +/// ==================== +template +KOKKOS_INLINE_FUNCTION static + typename Kokkos::Details::InnerProductSpaceTraits::mag_type + serial_nrm2(const int m, const ValueType *KOKKOS_RESTRICT X, + const int xs0) { + using IPT = Kokkos::Details::InnerProductSpaceTraits; + using norm_type = typename IPT::mag_type; + + norm_type nrm = Kokkos::ArithTraits::zero(); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int i = 0; i < m; ++i) + nrm += IPT::norm(IPT::dot(X[i * xs0], X[i * xs0])); + + return Kokkos::ArithTraits::sqrt(nrm); +} + +template +KOKKOS_INLINE_FUNCTION static void serial_nrm2( + const int m, const int n, const ValueType *KOKKOS_RESTRICT X, const int xs0, + const int xs1, + typename Kokkos::Details::InnerProductSpaceTraits::mag_type + *KOKKOS_RESTRICT R, + const int ys0) { + for (int vecIdx = 0; vecIdx < n; ++vecIdx) + R[vecIdx * ys0] = serial_nrm2(m, X + vecIdx * xs1, xs0); + + return; +} + +} // namespace Impl +} // namespace KokkosBlas + +#endif // KOKKOSBLAS_SERIAL_NRM2_HPP_ diff --git a/src/blas/impl/KokkosBlas_trtri_spec.hpp b/src/blas/impl/KokkosBlas_trtri_spec.hpp index 1cccad1ea4..0bbeb294dc 100644 --- a/src/blas/impl/KokkosBlas_trtri_spec.hpp +++ b/src/blas/impl/KokkosBlas_trtri_spec.hpp @@ -69,7 +69,7 @@ struct trtri_eti_spec_avail { MEM_SPACE) \ template <> \ struct trtri_eti_spec_avail< \ - Kokkos::View >, \ Kokkos::View, \ Kokkos::MemoryTraits > > { \ @@ -136,7 +136,7 @@ struct TRTRI { // #define KOKKOSBLAS_TRTRI_ETI_SPEC_DECL(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ extern template struct TRTRI< \ - Kokkos::View >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -144,7 +144,7 @@ struct TRTRI { #define 
KOKKOSBLAS_TRTRI_ETI_SPEC_INST(SCALAR, LAYOUTA, EXEC_SPACE, MEM_SPACE) \ template struct TRTRI< \ - Kokkos::View >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/src/kokkoskernels_eti.cmake b/src/cmake/kokkoskernels_eti.cmake similarity index 100% rename from src/kokkoskernels_eti.cmake rename to src/cmake/kokkoskernels_eti.cmake diff --git a/src/common/KokkosKernels_BlockHashmapAccumulator.hpp b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp new file mode 100644 index 0000000000..576060cf75 --- /dev/null +++ b/src/common/KokkosKernels_BlockHashmapAccumulator.hpp @@ -0,0 +1,660 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef _KOKKOSKERNELS_BLOCKHASHMAPACCUMULATOR_HPP +#define _KOKKOSKERNELS_BLOCKHASHMAPACCUMULATOR_HPP +#include +#include +#include "KokkosKernels_BlockUtils.hpp" +#include "KokkosKernels_HashmapAccumulator.hpp" + +//#define HASHMAPACCUMULATOR_ASSERT_ENABLED + +namespace KokkosKernels { + +namespace Experimental { + +template +/** + * \brief BlockHashmapAccumulator class + * The use of this is described in the paper: + * "Performance-portable sparse matrix-matrix multiplication for many-core + * architectures" ( https://ieeexplore.ieee.org/abstract/document/7965111/ ) in + * section III.D + * + * Public members: + * \var hash_begins: Holds the beginning indices of the linked lists + * corresponding to hash values [Begins] + * \var hash_nexts: Holds the indicies of the next elements + * within the linked list [Nexts] + * \var keys: This stores the column indices of the crs matrix [Ids] + * \var values: This store the numerical values (matrix elements) [Values] + * + * Private members: + * \var __max_value_size: The length of the two arrays (keys and hash_nexts) + * \var __hashOpRHS: The right hand side of the requested hash operation. + * \var __insert_success: Value to return upon insertion success. + * \var __insert_full: Value to return upon insertion failure. 
+ */ +struct BlockHashmapAccumulator { + // begin public members + // issue-508, TODO: It's best for used_size to be an internal member of this + // class but the current use-cases rely on used_size to be a parameter to the + // below insertion routines. One way to remove used_size as a parameter to the + // insertion routines is to instantiate multiple BlockHashmapAccumulator + // objects (one hashmap for each team of threads) instead of using a single + // BlockHashmapAccumulator object for multiple teams of threads; this entails + // major refactoring throughout the kokkos-kernels code base. + // Making used_size a pointer and private member of this + // class still exposes access to this member outside of the class and is + // not a good option. + // size_type used_size; + + // issue-508, TODO: The hash_begins, hash_nexts, keys, values, + // __insert_success, and __insert_full members should all be private as well. + // They should be managed solely by this BlockHashmapAccumulator class: + // initialized in the constructor(s) and only managed by + // BlockHashmapAccumulator insertion routines. Making these members private + // requires major refactoring throughout the kokkos-kernels code base. If + // allocations for these members must really live outside this class, we need + // new members that break + // __max_value_size into: hash_begins_len, hash_nexts_len, keys_len, and + // values_len...! + + size_type *hash_begins; + size_type *hash_nexts; + key_type *keys; + value_type *values; + const size_type block_dim; + const size_type block_size; + + /** + * \brief default constructor BlockHashmapAccumulator + * Sets used_size to 0, __insert_success to 0, __insert_full to 1, and + * __hashOpRHS to 0. + * + * Assumption: hash_begins_ are all initialized to -1. 
+ */ + KOKKOS_INLINE_FUNCTION + BlockHashmapAccumulator() + : hash_begins(), + hash_nexts(), + keys(), + values(), + __max_value_size(), + __hashOpRHS(0) {} + + /** + * \brief parameterized constructor BlockHashmapAccumulator + * Sets used_size to 0, __insert_success to 0, and __insert_full to 1. + * + * \param max_value_size_: The length of the two arrays (keys and hash_nexts) + * \param hashOpRHS: The right hand side of the requested hash + * operation. \param hash_begins_: Holds the beginning indices of the + * linked lists corresponding to hash values [Begins] \param hash_nexts_: + * Holds the indicies of the next elements within the linked list [Nexts] + * \param keys_: This stores the column indices of (??) [Ids] + * \param values_: This store the (matrix element?) numerical value of + * (??) [Values] + * + * Assumption: hash_begins_ are all initialized to -1. + */ + KOKKOS_INLINE_FUNCTION + BlockHashmapAccumulator(size_type block_dim_, const size_type max_value_size_, + const size_type hashOpRHS, size_type *hash_begins_, + size_type *hash_nexts_, key_type *keys_, + value_type *values_) + : hash_begins(hash_begins_), + hash_nexts(hash_nexts_), + keys(keys_), + values(values_), + block_dim(block_dim_), + block_size(block_dim_ * block_dim_), + __max_value_size(max_value_size_), + __hashOpRHS(hashOpRHS) { + // Substract 1 and use the bitwiseAnd __compute_hash member. + if (std::is_same::value) { + __hashOpRHS -= 1; + } + } + + // Performs C[hash] += A * B (for existing entry) + // or C[hash] = A * B (for new entry) + // Insertion is sequential, no race condition for the insertion. + // the mergeadd used in the numeric of KKMEM. 
+ KOKKOS_INLINE_FUNCTION + void sequential_insert_into_hash_mergeAdd_TrackHashes( + key_type key, const value_type *valueA, const value_type *valueB, + size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { + size_type hash, i, my_index; + + if (key == -1) return; + + // issue-508, TODO: ensure that i < __max_value_size, but + // need information about length of keys, values, and hash_nexts first! + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + KokkosSparse::Impl::kk_block_add_mul(block_dim, values + i * block_size, + valueA, valueB); + return; + } + } + + my_index = (*used_size_)++; + + if (hash_begins[hash] == -1) { + used_hashes[used_hash_size[0]++] = hash; + } + hash_nexts[my_index] = hash_begins[hash]; + + hash_begins[hash] = my_index; + keys[my_index] = key; + KokkosSparse::Impl::kk_block_set_mul( + block_dim, values + my_index * block_size, valueA, valueB); + } + + // Performs C[hash] += A * B (for existing entry) + // or C[hash] = A * B (for new entry) + // Insertion is sequential, no race condition for the insertion. + // the mergeadd used in the numeric of KKMEM. 
+ KOKKOS_INLINE_FUNCTION + void sequential_insert_into_hash_simple(key_type key, const value_type *a_val, + const value_type *b_val, + size_type &used_size, + size_type *used_hashes) { + for (size_type hash = (key * HASHSCALAR) & __hashOpRHS;; + hash = (hash + 1) & __hashOpRHS) { + if (keys[hash] == -1) { + used_hashes[used_size++] = hash; + keys[hash] = key; + KokkosSparse::Impl::kk_block_set_mul( + block_dim, values + hash * block_size, a_val, b_val); + break; + } else if (keys[hash] == key) { + KokkosSparse::Impl::kk_block_add_mul( + block_dim, values + hash * block_size, a_val, b_val); + break; + } + } + } + + KOKKOS_INLINE_FUNCTION + void sequential_export_values_simple(const size_type used_size, + const size_type *used_hashes, + key_type *out_keys, + value_type *out_values, + const bool clear = true) { + for (size_type i = 0; i < used_size; ++i) { + const auto hash = used_hashes[i]; + out_keys[i] = keys[hash]; + KokkosSparse::Impl::kk_block_set(block_dim, out_values + i * block_size, + values + hash * block_size); + if (clear) { + keys[hash] = -1; + } + } + } + + // used in the kkmem's numeric phase for second level hashmaps. + // function to be called from device. + // Accumulation is Add operation. It is not atomicAdd, as this + // is for the cases where we know that none of the simultanous + // insertions will have the same key. + // Insertion is simulteanous for the vector lanes of a thread. 
+ // used_size should be a shared pointer among the thread vectors + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_mergeAdd_TrackHashes( + const key_type key, const value_type *valA, const value_type *valB, + volatile size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { + size_type hash, i, my_write_index, hashbeginning; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + if (hash != -1) { + i = hash_begins[hash]; + + for (; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + KokkosSparse::Impl::kk_block_add_mul( + block_dim, values + i * block_size, valA, valB); + return __insert_success; + } + } + } else { + return __insert_success; + } + + my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= __max_value_size) { + return __insert_full; + } else { + keys[my_write_index] = key; + KokkosSparse::Impl::kk_block_set_mul( + block_dim, values + my_write_index * block_size, valA, valB); + +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA and up because warps do not go in SIMD + // fashion anymore. while some thread might insert my_write_index into + // linked list, another thread in the warp might be reading keys in above + // loop. before inserting the new value in liked list -- which is done + // with atomic exchange below, we make sure that the linked is is complete + // my assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, + // because no two keys will be inserted into hashmap at the same time, as + // rows have unique columns. 
+ + // Neither the compiler nor the execution unit can re-order the line + // directly below with the next line performing the atomic_exchange as the + // atomic exchange writes to hash_begins[hash] and this line reads from + // hash_begins[hash]. + // This line is needed such that threads of execution can still access the + // old linked list, after hash_begins+hash has been atomically overwritten + // with my_write_index but before hash_nexts[my_write_index] is + // overwritten with hashbeginning. If this line was not here, threads may + // not be able to access the dangling linked list since + // hash_nexts[my_write_index] would still be -1. + hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + if (hashbeginning == -1) { + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = + hash; + } + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } + + template + KOKKOS_INLINE_FUNCTION int + vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( + const team_member_t & /* teamMember */, const int /* vector_size */, + size_type hash, const key_type key, const value_type *valA, + const value_type *valB, volatile size_type *used_size_, + const size_type max_value_size_) { + // Cannot compute hash here due to impl_speed use-case + // hash = __compute_hash(key, __hashOpRHS); + if (key == -1) return __insert_success; + + if (hash != -1) { + size_type i = hash_begins[hash]; + for (; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + KokkosSparse::Impl::kk_block_add_mul( + block_dim, values + i * block_size, valA, valB); + return __insert_success; + } + } + } else { + return __insert_success; + } + + // Ensure that threads don't continue incrementing used_size_ if the hashmap + // is full, used_size_ could overflow and result in undefined behavior. 
+ if (used_size_[0] >= max_value_size_) { + return __insert_full; + } + size_type my_write_index = + Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= max_value_size_) { + return __insert_full; + } else { + keys[my_write_index] = key; + KokkosSparse::Impl::kk_block_set_mul( + block_dim, values + my_write_index * block_size, valA, valB); + +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA and up because warps do not go in SIMD + // fashion anymore. while some thread might insert my_write_index into + // linked list, another thread in the warp might be reading keys in above + // loop. before inserting the new value in liked list -- which is done + // with atomic exchange below, we make sure that the linked is is complete + // my assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, + // because no two keys will be inserted into hashmap at the same time, as + // rows have unique columns. + + // Neither the compiler nor the execution unit can re-order the line + // directly below with the next line performing the atomic_exchange as the + // atomic exchange writes to hash_begins[hash] and this line reads from + // hash_begins[hash]. + // This line is needed such that threads of execution can still access the + // old linked list, after hash_begins+hash has been atomically overwritten + // with my_write_index but before hash_nexts[my_write_index] is + // overwritten with hashbeginning. If this line was not here, threads may + // not be able to access the dangling linked list since + // hash_nexts[my_write_index] would still be -1. 
+ hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + // Atomically: + // hashbeginning = hash_begins[hash] + // hash_begins[hash] = my_write_index + // hash_nexts[my_write_index] = hash_begins[hash] + size_type hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } + + // used in kkmem's numeric phase to insert to first level hashmaps. + // function to be called from device. + // Accumulation is Add operation. It is not atomicAdd, as this + // is for the cases where we know that none of the simultanous + // insertions will have the same key. + // Insertion is simulteanous for the vector lanes of a thread. + // used_size should be a shared pointer among the thread vectors + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_mergeAdd(const key_type key, + const value_type *valA, + const value_type *valB, + volatile size_type *used_size_) { + if (key == -1) return __insert_success; + + return vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( + nullptr, 0, __compute_hash(key, __hashOpRHS), key, valA, valB, + used_size_, __max_value_size); + } + +#if 0 + // used in symbolic of kkmem if the compression is not applied. 
+ KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash(const key_type &key, + volatile size_type *used_size_) { + size_type hash, i, my_write_index, hashbeginning; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + return __insert_success; + } + } + + my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= __max_value_size) { + return __insert_full; + } else { + keys[my_write_index] = key; + +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA and up because warps do not go in SIMD + // fashion anymore. while some thread might insert my_write_index into + // linked list, another thread in the warp might be reading keys in above + // loop. before inserting the new value in liked list -- which is done + // with atomic exchange below, we make sure that the linked is is complete + // my assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, + // because no two keys will be inserted into hashmap at the same time, as + // rows have unique columns. + hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } + + // function to be called from device. + // Accumulation is Add operation. It is not atomicAdd, as this + // is for the cases where we know that none of the simultanous + // insertions will have the same key. + // Insertion is simulteanous for the vector lanes of a thread. 
+ // used_size should be a shared pointer among the thread vectors + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_mergeOr(const key_type &key, + const value_type &value, + volatile size_type *used_size_) { + size_type hash, i, my_write_index, hashbeginning; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + values[i] = values[i] | value; + return __insert_success; + } + } + + my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= __max_value_size) { + return __insert_full; + } else { + keys[my_write_index] = key; + values[my_write_index] = value; + +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA and up because warps do not go in SIMD + // fashion anymore. while some thread might insert my_write_index into + // linked list, another thread in the warp might be reading keys in above + // loop. before inserting the new value in liked list -- which is done + // with atomic exchange below, we make sure that the linked is is complete + // my assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, + // because no two keys will be inserted into hashmap at the same time, as + // rows have unique columns. + hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } + + // function to be called from device. + // Accumulation is Add operation. 
It is not atomicAdd, as this + // is for the cases where we know that none of the simultanous + // insertions will have the same key. + // Insertion is simulteanous for the vector lanes of a thread. + // used_size should be a shared pointer among the thread vectors + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_mergeOr_TrackHashes( + const key_type &key, const value_type &value, + volatile size_type *used_size_, size_type *used_hash_size, + size_type *used_hashes) { + size_type hash, i, my_write_index, hashbeginning; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + values[i] = values[i] | value; + return __insert_success; + } + } + + my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= __max_value_size) { + return __insert_full; + } else { + keys[my_write_index] = key; + values[my_write_index] = value; + +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA and up because warps do not go in SIMD + // fashion anymore. while some thread might insert my_write_index into + // linked list, another thread in the warp might be reading keys in above + // loop. before inserting the new value in liked list -- which is done + // with atomic exchange below, we make sure that the linked is is complete + // my assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, + // because no two keys will be inserted into hashmap at the same time, as + // rows have unique columns. 
+ hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + if (hashbeginning == -1) { + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = + hash; + } + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } + + KOKKOS_INLINE_FUNCTION + int vector_atomic_insert_into_hash_TrackHashes(const key_type &key, + volatile size_type *used_size_, + size_type *used_hash_size, + size_type *used_hashes) { + size_type hash, i, my_write_index, hashbeginning; + + if (key == -1) return __insert_success; + + hash = __compute_hash(key, __hashOpRHS); + for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { + if (keys[i] == key) { + return __insert_success; + } + } + + my_write_index = Kokkos::atomic_fetch_add(used_size_, size_type(1)); + + if (my_write_index >= __max_value_size) { + return __insert_full; + } else { + keys[my_write_index] = key; + +#if defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_TURING75) || \ + defined(KOKKOS_ARCH_AMPERE) + // this is an issue on VOLTA and up because warps do not go in SIMD + // fashion anymore. while some thread might insert my_write_index into + // linked list, another thread in the warp might be reading keys in above + // loop. before inserting the new value in liked list -- which is done + // with atomic exchange below, we make sure that the linked is is complete + // my assigning the hash_next to current head. the head might be different + // when we do the atomic exchange. this would cause temporarily skipping a + // key in the linkedlist until hash_nexts is updated second time as below. + // but this is okay for spgemm, + // because no two keys will be inserted into hashmap at the same time, as + // rows have unique columns. 
+ hash_nexts[my_write_index] = hash_begins[hash]; +#endif + + hashbeginning = + Kokkos::atomic_exchange(hash_begins + hash, my_write_index); + if (hashbeginning == -1) { + used_hashes[Kokkos::atomic_fetch_add(used_hash_size, size_type(1))] = + hash; + } + hash_nexts[my_write_index] = hashbeginning; + return __insert_success; + } + } +#endif + // end public members + private: + size_type __max_value_size; + size_type __hashOpRHS; + static constexpr int __insert_success = 0; + static constexpr int __insert_full = 1; + + template ::value || + std::is_same::value, + std::size_t>::type = 0> + KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type bitmask) { + size_type hash = key & bitmask; +#ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED + if (hash == -1) Kokkos::abort("__compute_hash: hash = -1"); + if (key == -1) Kokkos::abort("__compute_hash: key = -1"); +#endif // HASHMAPACCUMULATOR_ASSERT_ENABLED + return hash; + } + + template ::value, + std::size_t>::type = 0> + KOKKOS_INLINE_FUNCTION int __compute_hash(size_type key, size_type divisor) { + size_type hash = key % divisor; +#ifdef HASHMAPACCUMULATOR_ASSERT_ENABLED + if (hash == -1) Kokkos::abort("__compute_hash: hash = -1"); + if (key == -1) Kokkos::abort("__compute_hash: key = -1"); +#endif // HASHMAPACCUMULATOR_ASSERT_ENABLED + return hash; + } + // private +}; // struct BlockHashmapAccumulator + +} // namespace Experimental +} // namespace KokkosKernels + +#endif // _KOKKOSKERNELS_HASHMAPACCUMULATOR_HPP diff --git a/src/common/KokkosKernels_BlockUtils.hpp b/src/common/KokkosKernels_BlockUtils.hpp new file mode 100644 index 0000000000..0c001ce115 --- /dev/null +++ b/src/common/KokkosKernels_BlockUtils.hpp @@ -0,0 +1,145 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef _KOKKOSKERNELS_BLOCKUTILS_HPP +#define _KOKKOSKERNELS_BLOCKUTILS_HPP + +// #include +// #include +#include "KokkosBatched_Gemm_Serial_Internal.hpp" + +namespace KokkosSparse { +namespace Impl { + +// Initializes block: A = [val, val, val, ....] 
+template +KOKKOS_INLINE_FUNCTION void kk_block_init( + const size_type block_dim, value_type *dst, + const value_type val = static_cast( + 0)) { // Note: replaces __host__ std::fill() not to be called from GPU + for (auto end = dst + (block_dim * block_dim); dst < end; ++dst) { + *dst = val; + } +} + +// Initializes block: A = B +template +KOKKOS_INLINE_FUNCTION void kk_block_set(const size_type block_dim, + value_type *dst, + const value_type *val) { + memcpy(dst, val, block_dim * block_dim * sizeof(value_type)); +} + +// Performs A += B on blocks +template +KOKKOS_INLINE_FUNCTION void kk_block_add(const size_type block_dim, + value_type *dst, + const value_type *val) { + const auto end = dst + block_dim * block_dim; + while (dst < end) { + *(dst++) += *(val++); + } +} + +// Performs C += A * B on blocks +// Note: block is assumed to be row-major, dense matrix (no extra padding) +// Note: set clear=true to set C = 0 before increment +template > +KOKKOS_INLINE_FUNCTION void kk_block_dgemm(const size_type block_dim, + value_type *dst, + const value_type *valA, + const value_type *valB, + const bool clear = false) { + const auto ZERO = static_cast(0); + const auto ONE = static_cast(1); + DGEMM::invoke(block_dim, block_dim, block_dim, ONE, valA, block_dim, 1, valB, + block_dim, 1, clear ? 
ZERO : ONE, dst, block_dim, 1); +} + +// dgemm: C = A * B +template +KOKKOS_INLINE_FUNCTION void kk_block_set_mul(const size_type block_dim, + value_type *c_val, + const value_type *a_val, + const value_type *b_val) { + kk_block_dgemm(block_dim, c_val, a_val, b_val, true); +} + +// dgemm: C += A * B +template +KOKKOS_INLINE_FUNCTION void kk_block_add_mul(const size_type block_dim, + value_type *c_val, + const value_type *a_val, + const value_type *b_val) { + kk_block_dgemm(block_dim, c_val, a_val, b_val, false); +} + +// Performs C += A * B (dense GEMM) on blocks +// Note: all pointers reference dense row-major blocks (no extra padding) +template +KOKKOS_INLINE_FUNCTION void kk_vector_block_add_mul(const size_type block_dim, + value_type *dst, + const value_type *valA, + const value_type *valB) { + // NOTE: this should be replaced by batched DGEMM + // once atomic increment is supported there + for (size_type row = 0; row < block_dim; ++row) { + auto const row_offset = row * block_dim; + for (size_type col = 0; col < block_dim; ++col) { + auto v = &dst[row_offset + col]; + auto vb = valB + col; + for (const value_type *va = valA + row_offset, *end = va + block_dim; + va < end; ++va) { + Kokkos::atomic_add(v, (*va) * (*vb)); + vb += block_dim; + } + } + } +} + +} // namespace Impl +} // namespace KokkosSparse + +#endif // _KOKKOSKERNELS_BLOCKUTILS_HPP diff --git a/src/common/KokkosKernels_Error.hpp b/src/common/KokkosKernels_Error.hpp index b2f41fd4f6..11bd7f6953 100644 --- a/src/common/KokkosKernels_Error.hpp +++ b/src/common/KokkosKernels_Error.hpp @@ -54,6 +54,30 @@ inline void throw_runtime_exception(const std::string &msg) { throw std::runtime_error(msg); } +#if defined(KOKKOS_ENABLE_HIP) +inline void hip_internal_error_throw(hipError_t e, const char *name, + const char *file, const int line) { + std::ostringstream out; + out << name << " error( " << hipGetErrorName(e) + << "): " << hipGetErrorString(e); + if (file) { + out << " " << file << ":" << line; + } + 
throw_runtime_exception(out.str()); +} + +inline void hip_internal_safe_call(hipError_t e, const char *name, + const char *file = nullptr, + const int line = 0) { + if (hipSuccess != e) { + hip_internal_error_throw(e, name, file, line); + } +} + +#define KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(call) \ + hip_internal_safe_call(call, #call, __FILE__, __LINE__) +#endif + } // namespace Impl } // namespace KokkosKernels diff --git a/src/common/KokkosKernels_ExecSpaceUtils.hpp b/src/common/KokkosKernels_ExecSpaceUtils.hpp index 444d787963..41e750e93e 100644 --- a/src/common/KokkosKernels_ExecSpaceUtils.hpp +++ b/src/common/KokkosKernels_ExecSpaceUtils.hpp @@ -42,16 +42,17 @@ //@HEADER */ +#ifndef _KOKKOSKERNELSUTILSEXECSPACEUTILS_HPP +#define _KOKKOSKERNELSUTILSEXECSPACEUTILS_HPP + #include "Kokkos_Core.hpp" +#include "KokkosKernels_Error.hpp" #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) #include #include #endif -#ifndef _KOKKOSKERNELSUTILSEXECSPACEUTILS_HPP -#define _KOKKOSKERNELSUTILSEXECSPACEUTILS_HPP - namespace KokkosKernels { namespace Impl { @@ -64,6 +65,7 @@ enum ExecSpaceType { Exec_HIP, Exec_SYCL }; + template KOKKOS_FORCEINLINE_FUNCTION ExecSpaceType kk_get_exec_space_type() { ExecSpaceType exec_space = Exec_SERIAL; @@ -205,7 +207,7 @@ inline void kk_get_free_total_memory( template <> inline void kk_get_free_total_memory( size_t& free_mem, size_t& total_mem) { - hipMemGetInfo(&free_mem, &total_mem); + KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem)); } #endif @@ -368,12 +370,12 @@ template <> struct SpaceInstance { static Kokkos::Experimental::HIP create() { hipStream_t stream; - hipStreamCreate(&stream); + KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream)); return Kokkos::Experimental::HIP(stream); } static void destroy(Kokkos::Experimental::HIP& space) { hipStream_t stream = space.hip_stream(); - hipStreamDestroy(stream); + KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream)); } static bool overlap() { // 
TODO: does HIP have an equivalent for CUDA_LAUNCH_BLOCKING? diff --git a/src/KokkosKernels_Half.hpp b/src/common/KokkosKernels_Half.hpp similarity index 100% rename from src/KokkosKernels_Half.hpp rename to src/common/KokkosKernels_Half.hpp diff --git a/src/common/KokkosKernels_HashmapAccumulator.hpp b/src/common/KokkosKernels_HashmapAccumulator.hpp index b7f39f75c2..c6397fd9ea 100644 --- a/src/common/KokkosKernels_HashmapAccumulator.hpp +++ b/src/common/KokkosKernels_HashmapAccumulator.hpp @@ -344,12 +344,12 @@ struct HashmapAccumulator { // Insertion is sequential, no race condition for the insertion. // the mergeadd used in the numeric of KKMEM. KOKKOS_INLINE_FUNCTION - int sequential_insert_into_hash_mergeAdd_TrackHashes( + void sequential_insert_into_hash_mergeAdd_TrackHashes( key_type key, value_type value, size_type *used_size_, size_type *used_hash_size, size_type *used_hashes) { size_type hash, i, my_index; - if (key == -1) return __insert_success; + if (key == -1) return; // issue-508, TODO: ensure that i < __max_value_size, but // need information about length of keys, values, and hash_nexts first! @@ -357,7 +357,7 @@ struct HashmapAccumulator { for (i = hash_begins[hash]; i != -1; i = hash_nexts[i]) { if (keys[i] == key) { values[i] = values[i] + value; - return __insert_success; + return; } } @@ -371,7 +371,6 @@ struct HashmapAccumulator { hash_begins[hash] = my_index; keys[my_index] = key; values[my_index] = value; - return __insert_success; } // no values. simply adds to the keys. diff --git a/src/common/KokkosKernels_IOUtils.hpp b/src/common/KokkosKernels_IOUtils.hpp index bf1f3b4bfc..08e6f3cdc7 100644 --- a/src/common/KokkosKernels_IOUtils.hpp +++ b/src/common/KokkosKernels_IOUtils.hpp @@ -88,363 +88,6 @@ inline void getRandomBounds(double mag, Kokkos::complex &start, end = Kokkos::complex(mag, mag); } -// MD: Bases on Christian's sparseMatrix_generate function in test_crsmatrix.cpp -// file. 
-template -void kk_sparseMatrix_generate(OrdinalType nrows, OrdinalType ncols, - SizeType &nnz, OrdinalType row_size_variance, - OrdinalType bandwidth, ScalarType *&values, - SizeType *&rowPtr, OrdinalType *&colInd) { - rowPtr = new SizeType[nrows + 1]; - - OrdinalType elements_per_row = nrows ? nnz / nrows : 0; - srand(13721); - rowPtr[0] = 0; - for (int row = 0; row < nrows; row++) { - int varianz = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; - int numRowEntries = elements_per_row + varianz; - if (numRowEntries < 0) numRowEntries = 0; - // Clamping numRowEntries above accomplishes 2 things: - // - If ncols is 0, numRowEntries will also be 0 - // - With numRowEntries at most 2/3 the number of columns, in the worst - // case - // 90% of insertions will succeed after 6 tries - if (numRowEntries > 0.66 * ncols) numRowEntries = 0.66 * ncols; - rowPtr[row + 1] = rowPtr[row] + numRowEntries; - } - nnz = rowPtr[nrows]; - values = new ScalarType[nnz]; - colInd = new OrdinalType[nnz]; - for (OrdinalType row = 0; row < nrows; row++) { - for (SizeType k = rowPtr[row]; k < rowPtr[row + 1]; ++k) { - while (true) { - OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row; - while (pos < 0) pos += ncols; - while (pos >= ncols) pos -= ncols; - - bool is_already_in_the_row = false; - for (SizeType j = rowPtr[row]; j < k; j++) { - if (colInd[j] == pos) { - is_already_in_the_row = true; - break; - } - } - if (!is_already_in_the_row) { - colInd[k] = pos; - break; - } - } - } - } - // Sample each value from uniform (-50, 50) for real types, or (-50 - 50i, 50 - // + 50i) for complex types. 
- Kokkos::View valuesView(values, nnz); - ScalarType randStart, randEnd; - getRandomBounds(50.0, randStart, randEnd); - Kokkos::Random_XorShift64_Pool pool(13718); - Kokkos::fill_random(valuesView, pool, randStart, randEnd); -} - -template -void kk_sparseMatrix_generate_lower_upper_triangle( - char uplo, OrdinalType nrows, OrdinalType ncols, SizeType &nnz, - OrdinalType /*row_size_variance*/, OrdinalType /*bandwidth*/, - ScalarType *&values, SizeType *&rowPtr, OrdinalType *&colInd) { - rowPtr = new SizeType[nrows + 1]; - - // OrdinalType elements_per_row = nnz/nrows; - srand(13721); - rowPtr[0] = 0; - for (int row = 0; row < nrows; row++) { - if (uplo == 'L') - rowPtr[row + 1] = rowPtr[row] + row + 1; - else - rowPtr[row + 1] = rowPtr[row] + ncols - (row); - } - nnz = rowPtr[nrows]; - values = new ScalarType[nnz]; - colInd = new OrdinalType[nnz]; - for (OrdinalType row = 0; row < nrows; row++) { - for (SizeType k = rowPtr[row]; k < rowPtr[row + 1]; k++) { - if (uplo == 'L') - colInd[k] = k - rowPtr[row]; - else - colInd[k] = row + (k - rowPtr[row]); - values[k] = 1.0; - } - } -} - -template -void kk_diagonally_dominant_sparseMatrix_generate( - OrdinalType nrows, OrdinalType ncols, SizeType &nnz, - OrdinalType row_size_variance, OrdinalType bandwidth, ScalarType *&values, - SizeType *&rowPtr, OrdinalType *&colInd, - ScalarType diagDominance = 10 * Kokkos::ArithTraits::one()) { - rowPtr = new SizeType[nrows + 1]; - - OrdinalType elements_per_row = nnz / nrows; - srand(13721); - rowPtr[0] = 0; - for (int row = 0; row < nrows; row++) { - int varianz = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; - if (varianz < 1) varianz = 1; - if (varianz > 0.75 * ncols) varianz = 0.75 * ncols; - rowPtr[row + 1] = rowPtr[row] + elements_per_row + varianz; - if (rowPtr[row + 1] <= rowPtr[row]) // This makes sure that there is - rowPtr[row + 1] = rowPtr[row] + 1; // at least one nonzero in the row - } - nnz = rowPtr[nrows]; - values = new ScalarType[nnz]; - colInd = new 
OrdinalType[nnz]; - for (OrdinalType row = 0; row < nrows; row++) { - ScalarType total_values = 0; - std::unordered_set entriesInRow; - // We always add the diagonal entry (after this loop) - entriesInRow.insert(row); - for (SizeType k = rowPtr[row]; k < rowPtr[row + 1] - 1; k++) { - while (true) { - OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row; - while (pos < 0) pos += ncols; - while (pos >= ncols) pos -= ncols; - - if (entriesInRow.find(pos) == entriesInRow.end()) { - entriesInRow.insert(pos); - colInd[k] = pos; - values[k] = 100.0 * rand() / RAND_MAX - 50.0; - total_values += - Kokkos::Details::ArithTraits::abs(values[k]); - break; - } - } - } - - colInd[rowPtr[row + 1] - 1] = row; - values[rowPtr[row + 1] - 1] = total_values * diagDominance; - } -} - -// This function creates a diagonal sparse matrix for testing matrix operations. -// The elements on the diagonal are 1, 2, ..., n-1, n. -// If "invert" is true, it will return the inverse of the above diagonal matrix. 
-template -crsMat_t kk_generate_diag_matrix(typename crsMat_t::const_ordinal_type n, - const bool invert = false) { - typedef typename crsMat_t::ordinal_type ot; - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - typedef typename row_map_view_t::non_const_value_type size_type; - typedef typename cols_view_t::non_const_value_type lno_t; - typedef typename values_view_t::non_const_value_type scalar_t; - - row_map_view_t rowmap_view("rowmap_view", n + 1); - cols_view_t columns_view("colsmap_view", n); - values_view_t values_view("values_view", n); - - { - typename row_map_view_t::HostMirror hr = - Kokkos::create_mirror_view(rowmap_view); - typename cols_view_t::HostMirror hc = - Kokkos::create_mirror_view(columns_view); - typename values_view_t::HostMirror hv = - Kokkos::create_mirror_view(values_view); - - for (lno_t i = 0; i <= n; ++i) { - hr(i) = size_type(i); - } - - for (ot i = 0; i < n; ++i) { - hc(i) = lno_t(i); - if (invert) { - hv(i) = scalar_t(1.0) / (scalar_t(i + 1)); - } else { - hv(i) = scalar_t(i + 1); - } - } - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - Kokkos::deep_copy(values_view, hv); - } - - graph_t static_graph(columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", n, values_view, static_graph); - return crsmat; -} - -template -crsMat_t kk_generate_diagonally_dominant_sparse_matrix( - typename crsMat_t::const_ordinal_type nrows, - typename crsMat_t::const_ordinal_type ncols, - typename crsMat_t::non_const_size_type &nnz, - typename crsMat_t::const_ordinal_type row_size_variance, - typename crsMat_t::const_ordinal_type bandwidth, - typename crsMat_t::const_value_type diagDominance = - 10 * Kokkos::ArithTraits::one()) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef 
typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - typedef typename row_map_view_t::non_const_value_type size_type; - typedef typename cols_view_t::non_const_value_type lno_t; - typedef typename values_view_t::non_const_value_type scalar_t; - lno_t *adj; - size_type *xadj; //, nnzA; - scalar_t *values; - - kk_diagonally_dominant_sparseMatrix_generate( - nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj, - diagDominance); - - row_map_view_t rowmap_view("rowmap_view", nrows + 1); - cols_view_t columns_view("colsmap_view", nnz); - values_view_t values_view("values_view", nnz); - - { - typename row_map_view_t::HostMirror hr = - Kokkos::create_mirror_view(rowmap_view); - typename cols_view_t::HostMirror hc = - Kokkos::create_mirror_view(columns_view); - typename values_view_t::HostMirror hv = - Kokkos::create_mirror_view(values_view); - - for (lno_t i = 0; i <= nrows; ++i) { - hr(i) = xadj[i]; - } - - for (size_type i = 0; i < nnz; ++i) { - hc(i) = adj[i]; - hv(i) = values[i]; - } - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - Kokkos::deep_copy(values_view, hv); - } - - graph_t static_graph(columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph); - delete[] xadj; - delete[] adj; - delete[] values; - return crsmat; -} - -template -crsMat_t kk_generate_triangular_sparse_matrix( - char uplo, typename crsMat_t::const_ordinal_type nrows, - typename crsMat_t::const_ordinal_type ncols, - typename crsMat_t::non_const_size_type &nnz, - typename crsMat_t::const_ordinal_type row_size_variance, - typename crsMat_t::const_ordinal_type bandwidth) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; 
- typedef typename crsMat_t::values_type::non_const_type values_view_t; - - typedef typename row_map_view_t::non_const_value_type size_type; - typedef typename cols_view_t::non_const_value_type lno_t; - typedef typename values_view_t::non_const_value_type scalar_t; - lno_t *adj; - size_type *xadj; //, nnzA; - scalar_t *values; - - kk_sparseMatrix_generate_lower_upper_triangle( - uplo, nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj); - - row_map_view_t rowmap_view("rowmap_view", nrows + 1); - cols_view_t columns_view("colsmap_view", nnz); - values_view_t values_view("values_view", nnz); - - { - typename row_map_view_t::HostMirror hr = - Kokkos::create_mirror_view(rowmap_view); - typename cols_view_t::HostMirror hc = - Kokkos::create_mirror_view(columns_view); - typename values_view_t::HostMirror hv = - Kokkos::create_mirror_view(values_view); - - for (lno_t i = 0; i <= nrows; ++i) { - hr(i) = xadj[i]; - } - - for (size_type i = 0; i < nnz; ++i) { - hc(i) = adj[i]; - hv(i) = values[i]; - } - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - Kokkos::deep_copy(values_view, hv); - Kokkos::fence(); - } - - graph_t static_graph(columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph); - delete[] xadj; - delete[] adj; - delete[] values; - return crsmat; -} - -template -crsMat_t kk_generate_sparse_matrix( - typename crsMat_t::const_ordinal_type nrows, - typename crsMat_t::const_ordinal_type ncols, - typename crsMat_t::non_const_size_type &nnz, - typename crsMat_t::const_ordinal_type row_size_variance, - typename crsMat_t::const_ordinal_type bandwidth) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - typedef typename row_map_view_t::non_const_value_type size_type; - 
typedef typename cols_view_t::non_const_value_type lno_t; - typedef typename values_view_t::non_const_value_type scalar_t; - lno_t *adj; - size_type *xadj; //, nnzA; - scalar_t *values; - - kk_sparseMatrix_generate( - nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj); - - row_map_view_t rowmap_view("rowmap_view", nrows + 1); - cols_view_t columns_view("colsmap_view", nnz); - values_view_t values_view("values_view", nnz); - - { - typename row_map_view_t::HostMirror hr = - Kokkos::create_mirror_view(rowmap_view); - typename cols_view_t::HostMirror hc = - Kokkos::create_mirror_view(columns_view); - typename values_view_t::HostMirror hv = - Kokkos::create_mirror_view(values_view); - - for (lno_t i = 0; i <= nrows; ++i) { - hr(i) = xadj[i]; - } - - for (size_type i = 0; i < nnz; ++i) { - hc(i) = adj[i]; - hv(i) = values[i]; - } - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - Kokkos::deep_copy(values_view, hv); - } - - graph_t static_graph(columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph); - delete[] xadj; - delete[] adj; - delete[] values; - return crsmat; -} - -// TODO: need to fix the size_type. All over the reading inputs are lno_t. 
- template void md_malloc(stype **arr, size_t n, std::string /*alloc_str*/ = "") { *arr = new stype[n]; @@ -550,133 +193,85 @@ inline void kk_read_1Dview_from_file(idx_array_type &view, Kokkos::fence(); } -template -void convert_crs_to_lower_triangle_edge_list(idx nv, idx *xadj, idx *adj, - idx *lower_triangle_srcs, - idx *lower_triangle_dests) { - idx ind = 0; - for (idx i = 0; i < nv; ++i) { - idx xb = xadj[i]; - idx xe = xadj[i + 1]; - for (idx j = xb; j < xe; ++j) { - idx dst = adj[j]; - if (i < dst) { - lower_triangle_srcs[ind] = i; - lower_triangle_dests[ind++] = dst; - } - } - } -} - -template -void convert_crs_to_edge_list(idx nv, idx *xadj, idx *srcs) { - for (idx i = 0; i < nv; ++i) { - idx xb = xadj[i]; - idx xe = xadj[i + 1]; - for (idx j = xb; j < xe; ++j) { - srcs[j] = i; +template +inline void kk_write_2Dview_to_file(idx_array_type view, const char *filename) { + typedef typename idx_array_type::HostMirror host_type; + // typedef typename idx_array_type::size_type idx; + host_type host_view = Kokkos::create_mirror_view(view); + Kokkos::deep_copy(host_view, view); + Kokkos::fence(); + std::ofstream myFile(filename, std::ios::out); + for (size_t i = 0; i < view.extent(0); ++i) { + for (size_t j = 0; j < view.extent(1); ++j) { + myFile << host_view(i, j) << " "; } + myFile << std::endl; } + myFile.close(); } -template -void convert_edge_list_to_csr(lno_t nv, size_type ne, lno_t *srcs, lno_t *dests, - wt *ew, size_type *xadj, lno_t *adj, wt *crs_ew) { - std::vector> edges(ne); - for (size_type i = 0; i < ne; ++i) { - edges[i].src = srcs[i]; - edges[i].dst = dests[i]; - edges[i].ew = ew[i]; - } - std::sort(edges.begin(), edges.begin() + ne); +template +inline void kk_read_2Dview_from_file(idx_array_type &view, + const char *filename) { + typedef typename idx_array_type::HostMirror host_type; + // typedef typename idx_array_type::size_type idx; + host_type host_view = Kokkos::create_mirror_view(view); + std::ifstream myFile(filename, std::ios::in); - 
size_type eind = 0; - for (lno_t i = 0; i < nv; ++i) { - (xadj)[i] = eind; - while (edges[eind].src == i) { - (adj)[eind] = edges[eind].dst; - (*crs_ew)[eind] = edges[eind].ew; - ++eind; + for (size_t i = 0; i < view.extent(0); ++i) { + for (size_t j = 0; j < view.extent(1); ++j) { + myFile >> host_view(i, j); } } - xadj[nv] = eind; + myFile.close(); + Kokkos::deep_copy(view, host_view); + Kokkos::fence(); } -template -void convert_undirected_edge_list_to_csr(lno_t nv, size_type ne, in_lno_t *srcs, - in_lno_t *dests, size_type *xadj, - lno_t *adj) { - std::vector> edges(ne * 2); - for (size_type i = 0; i < ne; ++i) { - edges[i * 2].src = srcs[i]; - edges[i * 2].dst = dests[i]; - - edges[i * 2 + 1].src = dests[i]; - edges[i * 2 + 1].dst = srcs[i]; - } -#ifdef KOKKOSKERNELS_HAVE_OUTER -#include -#include -#include -#include - __gnu_parallel::parallel_sort_mwms *>( - &(edges[0]), &(edges[0]) + ne * 2, - std::less>(), 64); -#else - std::sort(edges.begin(), edges.begin() + ne * 2); -#endif - - size_type eind = 0; - for (lno_t i = 0; i < nv; ++i) { - (xadj)[i] = eind; - while (edges[eind].src == i) { - (adj)[eind] = edges[eind].dst; - //(*crs_ew)[eind] = edges[eind].ew; - ++eind; +template +inline void kk_write_3Dview_to_file(idx_array_type view, const char *filename) { + typedef typename idx_array_type::HostMirror host_type; + // typedef typename idx_array_type::size_type idx; + host_type host_view = Kokkos::create_mirror_view(view); + Kokkos::deep_copy(host_view, view); + Kokkos::fence(); + std::ofstream myFile(filename, std::ios::out); + for (size_t i = 0; i < view.extent(0); ++i) { + for (size_t j = 0; j < view.extent(1); ++j) { + for (size_t k = 0; k < view.extent(2); ++k) { + myFile << host_view(i, j, k) << " "; + } + myFile << std::endl; } + myFile << std::endl; } - xadj[nv] = eind; + myFile.close(); } -/* - -template -void read_graph_src_dst_bin( - lno_t *nv, size_type *ne - ,size_type **xadj, lno_t **adj, scalar_t **ew, - const char *fnameSrc, const char 
*fnameTarg){ - size_t numEdges = 0; - size_t *srcs, *dst; //this type is hard coded - buildEdgeListFromBinSrcTarg_undirected( - fnameSrc, fnameTarg, - &numEdges, - &srcs, &dst); +template +inline void kk_read_3Dview_from_file(idx_array_type &view, + const char *filename) { + typedef typename idx_array_type::HostMirror host_type; + // typedef typename idx_array_type::size_type idx; + host_type host_view = Kokkos::create_mirror_view(view); + std::ifstream myFile(filename, std::ios::in); - lno_t num_vertex = 0; - for (size_t i = 0; i < numEdges; ++i){ - if (num_vertex < srcs[i]) num_vertex = srcs[i]; - if (num_vertex < dst[i]) num_vertex = dst[i]; + for (size_t i = 0; i < view.extent(0); ++i) { + for (size_t j = 0; j < view.extent(1); ++j) { + for (size_t k = 0; k < view.extent(2); ++k) { + myFile >> host_view(i, j, k); + } + } } - num_vertex += 1; - - *nv = num_vertex; - *ne = numEdges * 2; - - md_malloc(xadj, num_vertex + 1); - md_malloc(adj, numEdges * 2); - convert_undirected_edge_list_to_csr ( - num_vertex, numEdges, - srcs, dst, - *xadj, *adj); - - delete [] srcs; - delete [] dst; + myFile.close(); + Kokkos::deep_copy(view, host_view); + Kokkos::fence(); } -*/ template -void write_edgelist_bin(size_t ne, const idx *edge_begins, const idx *edge_ends, - const wt *ew, const char *filename) { +[[deprecated]] void write_edgelist_bin(size_t ne, const idx *edge_begins, + const idx *edge_ends, const wt *ew, + const char *filename) { std::ofstream myFile(filename, std::ios::out | std::ios::binary); myFile.write((char *)&ne, sizeof(idx)); myFile.write((char *)edge_begins, sizeof(idx) * (ne)); @@ -700,270 +295,6 @@ void read_edgelist_bin(idx *ne, idx **edge_begins, idx **edge_ends, wt **ew, myFile.close(); } -template -void write_graph_bin(lno_t nv, size_type ne, const size_type *xadj, - const lno_t *adj, const scalar_t *ew, - const char *filename) { - std::ofstream myFile(filename, std::ios::out | std::ios::binary); - myFile.write((char *)&nv, sizeof(lno_t)); - 
myFile.write((char *)&ne, sizeof(size_type)); - myFile.write((char *)xadj, sizeof(size_type) * (nv + 1)); - - myFile.write((char *)adj, sizeof(lno_t) * (ne)); - - myFile.write((char *)ew, sizeof(scalar_t) * (ne)); - - myFile.close(); -} - -template -void write_graph_crs(lno_t nv, size_type ne, const size_type *xadj, - const lno_t *adj, const scalar_t *ew, - const char *filename) { - std::ofstream myFile(filename, std::ios::out); - myFile << nv << " " << ne << std::endl; - - for (lno_t i = 0; i <= nv; ++i) { - myFile << xadj[i] << " "; - } - myFile << std::endl; - - for (lno_t i = 0; i < nv; ++i) { - size_type b = xadj[i]; - size_type e = xadj[i + 1]; - for (size_type j = b; j < e; ++j) { - myFile << adj[j] << " "; - } - myFile << std::endl; - } - for (size_type i = 0; i < ne; ++i) { - myFile << ew[i] << " "; - } - myFile << std::endl; - - myFile.close(); -} - -template -void write_graph_ligra(lno_t nv, size_type ne, const size_type *xadj, - const lno_t *adj, const scalar_t * /*ew*/, - const char *filename) { - std::ofstream ff(filename); - ff << "AdjacencyGraph" << std::endl; - ff << nv << std::endl << ne << std::endl; - for (lno_t i = 0; i < nv; ++i) { - ff << xadj[i] << std::endl; - } - for (size_type i = 0; i < ne; ++i) { - ff << adj[i] << std::endl; - } - ff.close(); -} - -// MM: types and utility functions for parsing the MatrixMarket format -namespace MM { -enum MtxObject { UNDEFINED_OBJECT, MATRIX, VECTOR }; -enum MtxFormat { UNDEFINED_FORMAT, COORDINATE, ARRAY }; -enum MtxField { - UNDEFINED_FIELD, - REAL, // includes both float and double - COMPLEX, // includes complex and complex - INTEGER, // includes all integer types - PATTERN // not a type, but means the value for every entry is 1 -}; -enum MtxSym { - UNDEFINED_SYMMETRY, - GENERAL, - SYMMETRIC, // A(i, j) = A(j, i) - SKEW_SYMMETRIC, // A(i, j) = -A(j, i) - HERMITIAN // A(i, j) = a + bi; A(j, i) = a - bi -}; - -// readScalar/writeScalar: read and write a scalar in the form that it appears -// in an 
.mtx file. The >> and << operators won't work, because complex appears -// as "real imag", not "(real, imag)" -template -scalar_t readScalar(std::istream &is) { - scalar_t val; - is >> val; - return val; -} - -template <> -inline Kokkos::complex readScalar(std::istream &is) { - float r, i; - is >> r; - is >> i; - return Kokkos::complex(r, i); -} - -template <> -inline Kokkos::complex readScalar(std::istream &is) { - double r, i; - is >> r; - is >> i; - return Kokkos::complex(r, i); -} - -template -void writeScalar(std::ostream &os, scalar_t val) { - os << val; -} - -template <> -inline void writeScalar(std::ostream &os, Kokkos::complex val) { - os << val.real() << ' ' << val.imag(); -} - -template <> -inline void writeScalar(std::ostream &os, Kokkos::complex val) { - os << val.real() << ' ' << val.imag(); -} - -// symmetryFlip: given a value for A(i, j), return the value that -// should be inserted at A(j, i) (if any) -template -scalar_t symmetryFlip(scalar_t val, MtxSym symFlag) { - if (symFlag == SKEW_SYMMETRIC) return -val; - return val; -} - -template <> -inline Kokkos::complex symmetryFlip(Kokkos::complex val, - MtxSym symFlag) { - if (symFlag == HERMITIAN) - return Kokkos::conj(val); - else if (symFlag == SKEW_SYMMETRIC) - return -val; - return val; -} - -template <> -inline Kokkos::complex symmetryFlip(Kokkos::complex val, - MtxSym symFlag) { - if (symFlag == HERMITIAN) - return Kokkos::conj(val); - else if (symFlag == SKEW_SYMMETRIC) - return -val; - return val; -} -} // namespace MM - -template -void write_matrix_mtx(lno_t nrows, lno_t ncols, size_type nentries, - const size_type *xadj, const lno_t *adj, - const scalar_t *vals, const char *filename) { - std::ofstream myFile(filename); - myFile << "%%MatrixMarket matrix coordinate "; - if (std::is_same>::value || - std::is_same>::value) - myFile << "complex"; - else - myFile << "real"; - myFile << " general\n"; - myFile << nrows << " " << ncols << " " << nentries << '\n'; - myFile << std::setprecision(17) 
<< std::scientific; - for (lno_t i = 0; i < nrows; ++i) { - size_type b = xadj[i]; - size_type e = xadj[i + 1]; - for (size_type j = b; j < e; ++j) { - myFile << i + 1 << " " << adj[j] + 1 << " "; - MM::writeScalar(myFile, vals[j]); - myFile << '\n'; - } - } - myFile.close(); -} - -template -void write_graph_mtx(lno_t nv, size_type ne, const size_type *xadj, - const lno_t *adj, const scalar_t *ew, - const char *filename) { - std::ofstream myFile(filename); - myFile << "%%MatrixMarket matrix coordinate "; - if (std::is_same>::value || - std::is_same>::value) - myFile << "complex"; - else - myFile << "real"; - myFile << " general\n"; - myFile << nv << " " << nv << " " << ne << '\n'; - myFile << std::setprecision(8) << std::scientific; - for (lno_t i = 0; i < nv; ++i) { - size_type b = xadj[i]; - size_type e = xadj[i + 1]; - for (size_type j = b; j < e; ++j) { - myFile << i + 1 << " " << (adj)[j] + 1 << " "; - MM::writeScalar(myFile, ew[j]); - myFile << '\n'; - } - } - - myFile.close(); -} - -template -void read_graph_bin(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, - scalar_t **ew, const char *filename) { - std::ifstream myFile(filename, std::ios::in | std::ios::binary); - - myFile.read((char *)nv, sizeof(lno_t)); - myFile.read((char *)ne, sizeof(size_type)); - md_malloc(xadj, *nv + 1); - md_malloc(adj, *ne); - md_malloc(ew, *ne); - myFile.read((char *)*xadj, sizeof(size_type) * (*nv + 1)); - myFile.read((char *)*adj, sizeof(lno_t) * (*ne)); - myFile.read((char *)*ew, sizeof(scalar_t) * (*ne)); - myFile.close(); -} - -// When Kokkos issue #2313 is resolved, can delete -// parseScalar and just use operator>> -template -scalar_t parseScalar(std::istream &is) { - scalar_t val; - is >> val; - return val; -} - -template <> -inline Kokkos::complex parseScalar(std::istream &is) { - std::complex val; - is >> val; - return Kokkos::complex(val); -} - -template <> -inline Kokkos::complex parseScalar(std::istream &is) { - std::complex val; - is >> val; - return 
Kokkos::complex(val); -} - -template -void read_graph_crs(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, - scalar_t **ew, const char *filename) { - std::ifstream myFile(filename, std::ios::in); - myFile >> *nv >> *ne; - - md_malloc(xadj, *nv + 1); - md_malloc(adj, *ne); - md_malloc(ew, *ne); - - for (lno_t i = 0; i <= *nv; ++i) { - myFile >> (*xadj)[i]; - } - - for (size_type i = 0; i < *ne; ++i) { - myFile >> (*adj)[i]; - } - for (size_type i = 0; i < *ne; ++i) { - (*ew)[i] = parseScalar(myFile); - } - myFile.close(); -} - inline bool endswith(std::string const &fullString, std::string const &ending) { if (fullString.length() >= ending.length()) { return (0 == fullString.compare(fullString.length() - ending.length(), @@ -973,491 +304,6 @@ inline bool endswith(std::string const &fullString, std::string const &ending) { } } -template -void write_kokkos_crst_matrix(crs_matrix_t a_crsmat, const char *filename) { - typedef typename crs_matrix_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crs_matrix_t::values_type::non_const_type values_view_t; - - typedef typename row_map_view_t::value_type offset_t; - typedef typename cols_view_t::value_type lno_t; - typedef typename values_view_t::value_type scalar_t; - typedef typename values_view_t::size_type size_type; - - size_type nnz = a_crsmat.nnz(); - - auto a_rowmap_view = Kokkos::create_mirror_view_and_copy( - Kokkos::HostSpace(), a_crsmat.graph.row_map); - auto a_entries_view = Kokkos::create_mirror_view_and_copy( - Kokkos::HostSpace(), a_crsmat.graph.entries); - auto a_values_view = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_crsmat.values); - offset_t *a_rowmap = const_cast(a_rowmap_view.data()); - lno_t *a_entries = a_entries_view.data(); - scalar_t *a_values = a_values_view.data(); - - std::string strfilename(filename); - if (endswith(strfilename, 
".mtx") || endswith(strfilename, ".mm")) { - write_matrix_mtx( - a_crsmat.numRows(), a_crsmat.numCols(), a_crsmat.nnz(), a_rowmap, - a_entries, a_values, filename); - return; - } else if (a_crsmat.numRows() != a_crsmat.numCols()) { - throw std::runtime_error( - "For formats other than MatrixMarket (suffix .mm or .mtx),\n" - "write_kokkos_crst_matrix only supports square matrices"); - } - if (endswith(strfilename, ".bin")) { - write_graph_bin( - a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename); - } else if (endswith(strfilename, ".ligra")) { - write_graph_ligra( - a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename); - } else if (endswith(strfilename, ".crs")) { - write_graph_crs( - a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename); - } else { - std::string errMsg = - std::string("write_kokkos_crst_matrix: File extension on ") + filename + - " does not correspond to a known format"; - throw std::runtime_error(errMsg); - } -} - -template -int read_mtx(const char *fileName, lno_t *nrows, lno_t *ncols, size_type *ne, - size_type **xadj, lno_t **adj, scalar_t **ew, - bool symmetrize = false, bool remove_diagonal = true, - bool transpose = false) { - using namespace MM; - std::ifstream mmf(fileName, std::ifstream::in); - if (!mmf.is_open()) { - throw std::runtime_error("File cannot be opened\n"); - } - - std::string fline = ""; - getline(mmf, fline); - - if (fline.size() < 2 || fline[0] != '%' || fline[1] != '%') { - throw std::runtime_error("Invalid MM file. 
Line-1\n"); - } - - // make sure every required field is in the file, by initializing them to - // UNDEFINED_* - MtxObject mtx_object = UNDEFINED_OBJECT; - MtxFormat mtx_format = UNDEFINED_FORMAT; - MtxField mtx_field = UNDEFINED_FIELD; - MtxSym mtx_sym = UNDEFINED_SYMMETRY; - - if (fline.find("matrix") != std::string::npos) { - mtx_object = MATRIX; - } else if (fline.find("vector") != std::string::npos) { - mtx_object = VECTOR; - throw std::runtime_error( - "MatrixMarket \"vector\" is not supported by KokkosKernels read_mtx()"); - } - - if (fline.find("coordinate") != std::string::npos) { - // sparse - mtx_format = COORDINATE; - } else if (fline.find("array") != std::string::npos) { - // dense - mtx_format = ARRAY; - } - - if (fline.find("real") != std::string::npos || - fline.find("double") != std::string::npos) { - if (std::is_same::value || - std::is_same::value) - mtx_field = REAL; - else { - if (!std::is_floating_point::value) - throw std::runtime_error( - "scalar_t in read_mtx() incompatible with float or double typed " - "MatrixMarket file."); - else - mtx_field = REAL; - } - } else if (fline.find("complex") != std::string::npos) { - if (!(std::is_same>::value || - std::is_same>::value)) - throw std::runtime_error( - "scalar_t in read_mtx() incompatible with complex-typed MatrixMarket " - "file."); - else - mtx_field = COMPLEX; - } else if (fline.find("integer") != std::string::npos) { - if (std::is_integral::value || - std::is_floating_point::value || - std::is_same::value || - std::is_same::value) - mtx_field = INTEGER; - else - throw std::runtime_error( - "scalar_t in read_mtx() incompatible with integer-typed MatrixMarket " - "file."); - } else if (fline.find("pattern") != std::string::npos) { - mtx_field = PATTERN; - // any reasonable choice for scalar_t can represent "1" or "1.0 + 0i", so - // nothing to check here - } - - if (fline.find("general") != std::string::npos) { - mtx_sym = GENERAL; - } else if (fline.find("skew-symmetric") != 
std::string::npos) { - mtx_sym = SKEW_SYMMETRIC; - } else if (fline.find("symmetric") != std::string::npos) { - // checking for "symmetric" after "skew-symmetric" because it's a substring - mtx_sym = SYMMETRIC; - } else if (fline.find("hermitian") != std::string::npos || - fline.find("Hermitian") != std::string::npos) { - mtx_sym = HERMITIAN; - } - // Validate the matrix attributes - if (mtx_format == ARRAY) { - if (mtx_sym == UNDEFINED_SYMMETRY) mtx_sym = GENERAL; - if (mtx_sym != GENERAL) - throw std::runtime_error( - "array format MatrixMarket file must have general symmetry (optional " - "to include \"general\")"); - } - if (mtx_object == UNDEFINED_OBJECT) - throw std::runtime_error( - "MatrixMarket file header is missing the object type."); - if (mtx_format == UNDEFINED_FORMAT) - throw std::runtime_error("MatrixMarket file header is missing the format."); - if (mtx_field == UNDEFINED_FIELD) - throw std::runtime_error( - "MatrixMarket file header is missing the field type."); - if (mtx_sym == UNDEFINED_SYMMETRY) - throw std::runtime_error( - "MatrixMarket file header is missing the symmetry type."); - - while (1) { - getline(mmf, fline); - if (fline[0] != '%') break; - } - std::stringstream ss(fline); - lno_t nr = 0, nc = 0; - size_type nnz = 0; - ss >> nr >> nc; - if (mtx_format == COORDINATE) - ss >> nnz; - else - nnz = nr * nc; - size_type numEdges = nnz; - symmetrize = symmetrize || mtx_sym != GENERAL; - if (symmetrize && nr != nc) { - throw std::runtime_error("A non-square matrix cannot be symmetrized."); - } - if (mtx_format == ARRAY) { - // Array format only supports general symmetry and non-pattern - if (symmetrize) - throw std::runtime_error( - "array format MatrixMarket file cannot be symmetrized."); - if (mtx_field == PATTERN) - throw std::runtime_error( - "array format MatrixMarket file can't have \"pattern\" field type."); - } - if (symmetrize) { - numEdges = 2 * nnz; - } - // numEdges is only an upper bound (diagonal entries may be removed) - 
std::vector> edges(numEdges); - size_type nE = 0; - lno_t numDiagonal = 0; - for (size_type i = 0; i < nnz; ++i) { - getline(mmf, fline); - std::stringstream ss2(fline); - struct Edge tmp; - // read source, dest (edge) and weight (value) - lno_t s, d; - scalar_t w; - if (mtx_format == ARRAY) { - // In array format, entries are listed in column major order, - // so the row and column can be determined just from the index i - //(but make them 1-based indices, to match the way coordinate works) - s = i % nr + 1; // row - d = i / nr + 1; // col - } else { - // In coordinate format, row and col of each entry is read from file - ss2 >> s >> d; - } - if (mtx_field == PATTERN) - w = 1; - else - w = readScalar(ss2); - if (!transpose) { - tmp.src = s - 1; - tmp.dst = d - 1; - tmp.ew = w; - } else { - tmp.src = d - 1; - tmp.dst = s - 1; - tmp.ew = w; - } - if (tmp.src == tmp.dst) { - numDiagonal++; - if (!remove_diagonal) { - edges[nE++] = tmp; - } - continue; - } - edges[nE++] = tmp; - if (symmetrize) { - struct Edge tmp2; - tmp2.src = tmp.dst; - tmp2.dst = tmp.src; - // the symmetrized value is w, -w or conj(w) if mtx_sym is - // SYMMETRIC, SKEW_SYMMETRIC or HERMITIAN, respectively. 
- tmp2.ew = symmetryFlip(tmp.ew, mtx_sym); - edges[nE++] = tmp2; - } - } - mmf.close(); - std::sort(edges.begin(), edges.begin() + nE); - if (transpose) { - lno_t tmp = nr; - nr = nc; - nc = tmp; - } - // idx *nv, idx *ne, idx **xadj, idx **adj, wt **wt - *nrows = nr; - *ncols = nc; - *ne = nE; - //*xadj = new idx[nr + 1]; - md_malloc(xadj, nr + 1); - //*adj = new idx[nE]; - md_malloc(adj, nE); - //*ew = new wt[nE]; - md_malloc(ew, nE); - size_type eind = 0; - size_type actual = 0; - for (lno_t i = 0; i < nr; ++i) { - (*xadj)[i] = actual; - bool is_first = true; - while (eind < nE && edges[eind].src == i) { - if (is_first || !symmetrize || eind == 0 || - (eind > 0 && edges[eind - 1].dst != edges[eind].dst)) { - (*adj)[actual] = edges[eind].dst; - (*ew)[actual] = edges[eind].ew; - ++actual; - } - is_first = false; - ++eind; - } - } - (*xadj)[nr] = actual; - *ne = actual; - return 0; -} - -// Version of read_mtx which does not capture the number of columns. -// This is the old interface; it's kept for backwards compatibility. 
-template -int read_mtx(const char *fileName, lno_t *nv, size_type *ne, size_type **xadj, - lno_t **adj, scalar_t **ew, bool symmetrize = false, - bool remove_diagonal = true, bool transpose = false) { - lno_t ncol; // will discard - return read_mtx(fileName, nv, &ncol, ne, xadj, - adj, ew, symmetrize, - remove_diagonal, transpose); -} - -template -void read_matrix(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, - scalar_t **ew, const char *filename) { - std::string strfilename(filename); - if (endswith(strfilename, ".mtx") || endswith(strfilename, ".mm")) { - read_mtx(filename, nv, ne, xadj, adj, ew, false, false, false); - } - - else if (endswith(strfilename, ".bin")) { - read_graph_bin(nv, ne, xadj, adj, ew, filename); - } - - else if (endswith(strfilename, ".crs")) { - read_graph_crs(nv, ne, xadj, adj, ew, filename); - } - - else { - throw std::runtime_error("Reader is not available\n"); - } -} - -template -crsMat_t read_kokkos_crst_matrix(const char *filename_) { - std::string strfilename(filename_); - bool isMatrixMarket = - endswith(strfilename, ".mtx") || endswith(strfilename, ".mm"); - - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type row_map_view_t; - typedef typename graph_t::entries_type::non_const_type cols_view_t; - typedef typename crsMat_t::values_type::non_const_type values_view_t; - - typedef typename row_map_view_t::value_type size_type; - typedef typename cols_view_t::value_type lno_t; - typedef typename values_view_t::value_type scalar_t; - - lno_t nr, nc, *adj; - size_type *xadj, nnzA; - scalar_t *values; - - if (isMatrixMarket) { - // MatrixMarket file contains the exact number of columns - read_mtx(filename_, &nr, &nc, &nnzA, &xadj, - &adj, &values, false, false, false); - } else { - //.crs and .bin files don't contain #cols, so will compute it later based on - // the entries - read_matrix(&nr, &nnzA, &xadj, &adj, &values, - filename_); - } - - row_map_view_t 
rowmap_view("rowmap_view", nr + 1); - cols_view_t columns_view("colsmap_view", nnzA); - values_view_t values_view("values_view", nnzA); - - { - Kokkos::View> - hr(xadj, nr + 1); - Kokkos::View> - hc(adj, nnzA); - Kokkos::View> - hv(values, nnzA); - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - Kokkos::deep_copy(values_view, hv); - } - - if (!isMatrixMarket) { - KokkosKernels::Impl::kk_view_reduce_max( - nnzA, columns_view, nc); - nc++; - } - - graph_t static_graph(columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", nc, values_view, static_graph); - delete[] xadj; - delete[] adj; - delete[] values; - return crsmat; -} - -template -crsGraph_t read_kokkos_crst_graph(const char *filename_) { - typedef typename crsGraph_t::row_map_type::non_const_type row_map_view_t; - typedef typename crsGraph_t::entries_type::non_const_type cols_view_t; - - typedef typename row_map_view_t::value_type size_type; - typedef typename cols_view_t::value_type lno_t; - typedef double scalar_t; - - lno_t nv, *adj; - size_type *xadj, nnzA; - scalar_t *values; - read_matrix(&nv, &nnzA, &xadj, &adj, &values, - filename_); - - row_map_view_t rowmap_view("rowmap_view", nv + 1); - cols_view_t columns_view("colsmap_view", nnzA); - - { - typename row_map_view_t::HostMirror hr = - Kokkos::create_mirror_view(rowmap_view); - typename cols_view_t::HostMirror hc = - Kokkos::create_mirror_view(columns_view); - - for (lno_t i = 0; i <= nv; ++i) { - hr(i) = xadj[i]; - } - - for (size_type i = 0; i < nnzA; ++i) { - hc(i) = adj[i]; - } - Kokkos::deep_copy(rowmap_view, hr); - Kokkos::deep_copy(columns_view, hc); - } - - lno_t ncols = 0; - KokkosKernels::Impl::kk_view_reduce_max( - nnzA, columns_view, ncols); - ncols += 1; - - crsGraph_t static_graph(columns_view, rowmap_view, ncols); - delete[] xadj; - delete[] adj; - delete[] values; - return static_graph; -} - -template -inline void kk_sequential_create_incidence_matrix( - nnz_lno_t num_rows, const size_type *xadj, const 
nnz_lno_t *adj, - size_type *i_adj // output. preallocated -) { - std::vector c_xadj(num_rows); - for (nnz_lno_t i = 0; i < num_rows; i++) { - c_xadj[i] = xadj[i]; - } - int eCnt = 0; - for (nnz_lno_t i = 0; i < num_rows; i++) { - size_type begin = xadj[i]; - size_type end = xadj[i + 1]; - nnz_lno_t adjsize = end - begin; - - for (nnz_lno_t j = 0; j < adjsize; j++) { - size_type aind = j + begin; - nnz_lno_t col = adj[aind]; - if (i < col) { - i_adj[c_xadj[i]++] = eCnt; - i_adj[c_xadj[col]++] = eCnt++; - } - } - } - - for (nnz_lno_t i = 0; i < num_rows; i++) { - if (c_xadj[i] != xadj[i + 1]) { - std::cout << "i:" << i << " c_xadj[i]:" << c_xadj[i] - << " xadj[i+1]:" << xadj[i + 1] << std::endl; - } - } -} - -template -inline void kk_sequential_create_incidence_matrix_transpose( - const nnz_lno_t num_rows, const size_type num_edges, const size_type *xadj, - const nnz_lno_t *adj, - size_type *i_xadj, // output. preallocated - nnz_lno_t *i_adj // output. preallocated -) { - for (nnz_lno_t i = 0; i < num_edges / 2 + 1; i++) { - i_xadj[i] = i * 2; - } - int eCnt = 0; - for (nnz_lno_t i = 0; i < num_rows; i++) { - size_type begin = xadj[i]; - size_type end = xadj[i + 1]; - nnz_lno_t adjsize = end - begin; - - for (nnz_lno_t j = 0; j < adjsize; j++) { - size_type aind = j + begin; - nnz_lno_t col = adj[aind]; - if (i < col) { - i_adj[eCnt++] = i; - i_adj[eCnt++] = col; - } - } - } -} - } // namespace Impl } // namespace KokkosKernels diff --git a/src/common/KokkosKernels_SimpleUtils.hpp b/src/common/KokkosKernels_SimpleUtils.hpp index c1f68ebd3b..bb2a6d43b9 100644 --- a/src/common/KokkosKernels_SimpleUtils.hpp +++ b/src/common/KokkosKernels_SimpleUtils.hpp @@ -346,7 +346,7 @@ struct ReduceMaxFunctor { } } KOKKOS_INLINE_FUNCTION - void join(volatile value_type &dst, const volatile value_type &src) const { + void join(value_type &dst, const value_type &src) const { if (dst < src) { dst = src; } diff --git a/src/common/KokkosKernels_Sorting.hpp 
b/src/common/KokkosKernels_Sorting.hpp index 1cdf1df7ee..8b897047d9 100644 --- a/src/common/KokkosKernels_Sorting.hpp +++ b/src/common/KokkosKernels_Sorting.hpp @@ -61,41 +61,6 @@ struct DefaultComparator { }; } // namespace Impl -// ---------------------------------- -// CRS matrix/graph sorting utilities -// ---------------------------------- - -// The sort_crs* functions sort the adjacent column list for each row into -// ascending order. - -template -void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, - const values_t& values); - -template -void sort_crs_matrix(const crsMat_t& A); - -template -void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries); - -template -void sort_crs_graph(const crsGraph_t& G); - -// sort_and_merge_matrix produces a new matrix which is equivalent to A but is -// sorted and has no duplicated entries: each (i, j) is unique. Values for -// duplicated entries are summed. -template -crsMat_t sort_and_merge_matrix(const crsMat_t& A); - -template -crsGraph_t sort_and_merge_graph(const crsGraph_t& G); - -template -void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, - const entries_t& entries_in, rowmap_t& rowmap_out, - entries_t& entries_out); - // ---------------------------- // General device-level sorting // ---------------------------- @@ -148,240 +113,6 @@ KOKKOS_INLINE_FUNCTION void TeamBitonicSort2( namespace Impl { -template -struct SortCrsMatrixFunctor { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using scalar_t = typename values_t::non_const_value_type; - using team_mem = typename Kokkos::TeamPolicy::member_type; - // The functor owns memory for entriesAux, so it can't have - // MemoryTraits - using entries_managed_t = Kokkos::View; - using values_managed_t = Kokkos::View; - - SortCrsMatrixFunctor(bool usingRangePol, const rowmap_t& rowmap_, - const entries_t& entries_, const values_t& values_) - : 
rowmap(rowmap_), entries(entries_), values(values_) { - if (usingRangePol) { - entriesAux = entries_managed_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), - entries.extent(0)); - valuesAux = values_managed_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values aux"), - values.extent(0)); - } - // otherwise, aux arrays won't be allocated (sorting in place) - } - - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - // Radix sort requires unsigned keys for comparison - using unsigned_lno_t = typename std::make_unsigned::type; - KokkosKernels::SerialRadixSort2( - (unsigned_lno_t*)entries.data() + rowStart, - (unsigned_lno_t*)entriesAux.data() + rowStart, values.data() + rowStart, - valuesAux.data() + rowStart, rowNum); - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const { - size_type i = t.league_rank(); - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - KokkosKernels::TeamBitonicSort2( - entries.data() + rowStart, values.data() + rowStart, rowNum, t); - } - - rowmap_t rowmap; - entries_t entries; - entries_managed_t entriesAux; - values_t values; - values_managed_t valuesAux; -}; - -template -struct SortCrsGraphFunctor { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using team_mem = typename Kokkos::TeamPolicy::member_type; - // The functor owns memory for entriesAux, so it can't have - // MemoryTraits - using entries_managed_t = Kokkos::View; - - SortCrsGraphFunctor(bool usingRangePol, const rowmap_t& rowmap_, - const entries_t& entries_) - : rowmap(rowmap_), entries(entries_) { - if (usingRangePol) { - entriesAux = entries_managed_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), - entries.extent(0)); - } - // otherwise, aux arrays won't be allocated 
(sorting in place) - } - - KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - // Radix sort requires unsigned keys for comparison - using unsigned_lno_t = typename std::make_unsigned::type; - KokkosKernels::SerialRadixSort( - (unsigned_lno_t*)entries.data() + rowStart, - (unsigned_lno_t*)entriesAux.data() + rowStart, rowNum); - } - - KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const { - size_type i = t.league_rank(); - size_type rowStart = rowmap(i); - size_type rowEnd = rowmap(i + 1); - lno_t rowNum = rowEnd - rowStart; - KokkosKernels::TeamBitonicSort( - entries.data() + rowStart, rowNum, t); - } - - rowmap_t rowmap; - entries_t entries; - entries_managed_t entriesAux; -}; - -template -struct MergedRowmapFunctor { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using c_rowmap_t = typename rowmap_t::const_type; - - // Precondition: entries are sorted within each row - MergedRowmapFunctor(const rowmap_t& mergedCounts_, const c_rowmap_t& rowmap_, - const entries_t& entries_) - : mergedCounts(mergedCounts_), rowmap(rowmap_), entries(entries_) {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t row, size_type& lnewNNZ) const { - size_type rowBegin = rowmap(row); - size_type rowEnd = rowmap(row + 1); - if (rowEnd == rowBegin) { - // Row was empty to begin with - mergedCounts(row) = 0; - return; - } - // Otherwise, the first entry in the row exists - lno_t uniqueEntries = 1; - for (size_type j = rowBegin + 1; j < rowEnd; j++) { - if (entries(j - 1) != entries(j)) uniqueEntries++; - } - mergedCounts(row) = uniqueEntries; - lnewNNZ += uniqueEntries; - if (row == lno_t((rowmap.extent(0) - 1) - 1)) mergedCounts(row + 1) = 0; - } - - rowmap_t mergedCounts; - c_rowmap_t rowmap; - entries_t entries; -}; - -template -struct MatrixMergedEntriesFunctor { - using size_type = 
typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using scalar_t = typename values_t::non_const_value_type; - - // Precondition: entries are sorted within each row - MatrixMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_, - const values_t& values_, - const rowmap_t& mergedRowmap_, - const entries_t& mergedEntries_, - const values_t& mergedValues_) - : rowmap(rowmap_), - entries(entries_), - values(values_), - mergedRowmap(mergedRowmap_), - mergedEntries(mergedEntries_), - mergedValues(mergedValues_) {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const { - size_type rowBegin = rowmap(row); - size_type rowEnd = rowmap(row + 1); - if (rowEnd == rowBegin) { - // Row was empty to begin with, nothing to do - return; - } - // Otherwise, accumulate the value for each column - scalar_t accumVal = values(rowBegin); - lno_t accumCol = entries(rowBegin); - size_type insertPos = mergedRowmap(row); - for (size_type j = rowBegin + 1; j < rowEnd; j++) { - if (accumCol == entries(j)) { - // accumulate - accumVal += values(j); - } else { - // write out and reset - mergedValues(insertPos) = accumVal; - mergedEntries(insertPos) = accumCol; - insertPos++; - accumVal = values(j); - accumCol = entries(j); - } - } - // always left with the last unique entry - mergedValues(insertPos) = accumVal; - mergedEntries(insertPos) = accumCol; - } - - rowmap_t rowmap; - entries_t entries; - values_t values; - rowmap_t mergedRowmap; - entries_t mergedEntries; - values_t mergedValues; -}; - -template -struct GraphMergedEntriesFunctor { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - - // Precondition: entries are sorted within each row - GraphMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_, - const rowmap_t& mergedRowmap_, - const entries_t& mergedEntries_) - : rowmap(rowmap_), - entries(entries_), - 
mergedRowmap(mergedRowmap_), - mergedEntries(mergedEntries_) {} - - KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const { - size_type rowBegin = rowmap(row); - size_type rowEnd = rowmap(row + 1); - if (rowEnd == rowBegin) { - // Row was empty to begin with, nothing to do - return; - } - // Otherwise, accumulate the value for each column - lno_t accumCol = entries(rowBegin); - size_type insertPos = mergedRowmap(row); - for (size_type j = rowBegin + 1; j < rowEnd; j++) { - if (accumCol != entries(j)) { - // write out and reset - mergedEntries(insertPos) = accumCol; - insertPos++; - accumCol = entries(j); - } - } - // always left with the last unique entry - mergedEntries(insertPos) = accumCol; - } - - rowmap_t rowmap; - entries_t entries; - rowmap_t mergedRowmap; - entries_t mergedEntries; -}; - // Functor that sorts a view on one team template @@ -517,188 +248,6 @@ struct BitonicPhase2Functor { } // namespace Impl -// Sort a CRS matrix: within each row, sort entries ascending by column. -// At the same time, permute the values. -template -void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, - const values_t& values) { - using lno_t = typename entries_t::non_const_value_type; - using team_pol = Kokkos::TeamPolicy; - bool useRadix = !Impl::kk_is_gpu_exec_space(); - lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; - if (numRows == 0) return; - Impl::SortCrsMatrixFunctor - funct(useRadix, rowmap, entries, values); - if (useRadix) { - Kokkos::parallel_for("sort_crs_matrix", - Kokkos::RangePolicy(0, numRows), - funct); - } else { - // Try to get teamsize to be largest power of 2 not greater than avg entries - // per row - // TODO (probably important for performnce): add thread-level sort also, and - // use that for small avg degree. But this works for now. 
- lno_t idealTeamSize = 1; - lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; - while (idealTeamSize < avgDeg / 2) { - idealTeamSize *= 2; - } - team_pol temp(numRows, 1); - lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); - lno_t teamSize = std::min(idealTeamSize, maxTeamSize); - Kokkos::parallel_for("sort_crs_matrix", team_pol(numRows, teamSize), funct); - } -} - -template -void sort_crs_matrix(const crsMat_t& A) { - // Note: rowmap_t has const values, but that's OK as sorting doesn't modify it - using rowmap_t = typename crsMat_t::row_map_type; - using entries_t = typename crsMat_t::index_type::non_const_type; - using values_t = typename crsMat_t::values_type::non_const_type; - using exec_space = typename crsMat_t::execution_space; - // NOTE: the rowmap of a StaticCrsGraph is const-valued, but the - // entries and CrsMatrix values are non-const (so sorting them directly - // is allowed) - sort_crs_matrix( - A.graph.row_map, A.graph.entries, A.values); -} - -// Sort a CRS graph: within each row, sort entries ascending by column. -template -void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { - using lno_t = typename entries_t::non_const_value_type; - using team_pol = Kokkos::TeamPolicy; - bool useRadix = !Impl::kk_is_gpu_exec_space(); - lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; - if (numRows == 0) return; - Impl::SortCrsGraphFunctor funct( - useRadix, rowmap, entries); - if (useRadix) { - Kokkos::parallel_for("sort_crs_graph", - Kokkos::RangePolicy(0, numRows), - funct); - } else { - // Try to get teamsize to be largest power of 2 less than or equal to - // half the entries per row. 0.5 * #entries is bitonic's parallelism within - // a row. - // TODO (probably important for performnce): add thread-level sort also, and - // use that for small avg degree. But this works for now. 
- lno_t idealTeamSize = 1; - lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; - while (idealTeamSize < avgDeg / 2) { - idealTeamSize *= 2; - } - team_pol temp(numRows, 1); - lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); - lno_t teamSize = std::min(idealTeamSize, maxTeamSize); - Kokkos::parallel_for("sort_crs_graph", team_pol(numRows, teamSize), funct); - } -} - -template -void sort_crs_graph(const crsGraph_t& G) { - static_assert( - !std::is_const::value, - "sort_crs_graph requires StaticCrsGraph entries to be non-const."); - sort_crs_graph(G.row_map, G.entries); -} - -// Sort the rows of matrix, and merge duplicate entries. -template -crsMat_t sort_and_merge_matrix(const crsMat_t& A) { - using c_rowmap_t = typename crsMat_t::row_map_type; - using rowmap_t = typename crsMat_t::row_map_type::non_const_type; - using entries_t = typename crsMat_t::index_type::non_const_type; - using values_t = typename crsMat_t::values_type::non_const_type; - using size_type = typename rowmap_t::non_const_value_type; - using exec_space = typename crsMat_t::execution_space; - using range_t = Kokkos::RangePolicy; - sort_crs_matrix(A); - // Count entries per row into a new rowmap, in terms of merges that can be - // done - rowmap_t mergedRowmap( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"), - A.numRows() + 1); - size_type numCompressedEntries = 0; - Kokkos::parallel_reduce(range_t(0, A.numRows()), - Impl::MergedRowmapFunctor( - mergedRowmap, A.graph.row_map, A.graph.entries), - numCompressedEntries); - // Prefix sum to get rowmap - Impl::kk_exclusive_parallel_prefix_sum(A.numRows() + 1, - mergedRowmap); - entries_t mergedEntries("SortedMerged entries", numCompressedEntries); - values_t mergedValues("SortedMerged values", numCompressedEntries); - // Compute merged entries and values - Kokkos::parallel_for( - range_t(0, A.numRows()), - Impl::MatrixMergedEntriesFunctor( - A.graph.row_map, A.graph.entries, A.values, 
mergedRowmap, - mergedEntries, mergedValues)); - // Finally, construct the new compressed matrix - return crsMat_t("SortedMerged", A.numRows(), A.numCols(), - numCompressedEntries, mergedValues, mergedRowmap, - mergedEntries); -} - -template -void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, - const entries_t& entries_in, rowmap_t& rowmap_out, - entries_t& entries_out) { - using size_type = typename rowmap_t::non_const_value_type; - using lno_t = typename entries_t::non_const_value_type; - using range_t = Kokkos::RangePolicy; - using const_rowmap_t = typename rowmap_t::const_type; - lno_t numRows = rowmap_in.extent(0); - if (numRows <= 1) { - // Matrix has zero rows - rowmap_out = rowmap_t(); - entries_out = entries_t(); - return; - } - numRows--; - // Sort in place - sort_crs_graph(rowmap_in, entries_in); - // Count entries per row into a new rowmap, in terms of merges that can be - // done - rowmap_out = rowmap_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"), - numRows + 1); - size_type numCompressedEntries = 0; - Kokkos::parallel_reduce(range_t(0, numRows), - Impl::MergedRowmapFunctor( - rowmap_out, rowmap_in, entries_in), - numCompressedEntries); - // Prefix sum to get rowmap - Impl::kk_exclusive_parallel_prefix_sum(numRows + 1, - rowmap_out); - entries_out = entries_t("SortedMerged entries", numCompressedEntries); - // Compute merged entries and values - Kokkos::parallel_for( - range_t(0, numRows), - Impl::GraphMergedEntriesFunctor( - rowmap_in, entries_in, rowmap_out, entries_out)); -} - -template -crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { - using rowmap_t = typename crsGraph_t::row_map_type::non_const_type; - using entries_t = typename crsGraph_t::entries_type; - static_assert( - !std::is_const::value, - "sort_and_merge_graph requires StaticCrsGraph entries to be non-const."); - rowmap_t mergedRowmap; - entries_t mergedEntries; - sort_and_merge_graph(G.row_map, G.entries, mergedRowmap, - 
mergedEntries); - return crsGraph_t(mergedEntries, mergedRowmap); -} - // Version to be called from host on a single array // Generally ~2x slower than Kokkos::sort() for large arrays (> 50 M elements), // but faster for smaller arrays. @@ -1032,39 +581,6 @@ KOKKOS_INLINE_FUNCTION void TeamBitonicSort2(ValueType* values, PermType* perm, // For backward compatibility: keep the public interface accessible in // KokkosKernels::Impl:: namespace Impl { -template -[[deprecated]] void sort_crs_graph(const rowmap_t& rowmap, - const entries_t& entries) { - KokkosKernels::sort_crs_graph(rowmap, - entries); -} - -template -[[deprecated]] void sort_crs_matrix(const rowmap_t& rowmap, - const entries_t& entries, - const values_t& values) { - KokkosKernels::sort_crs_matrix(rowmap, entries, values); -} - -template -[[deprecated]] void sort_crs_matrix(const crsMat_t& A) { - KokkosKernels::sort_crs_matrix(A); -} - -template -[[deprecated]] void sort_and_merge_graph( - const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, - rowmap_t& rowmap_out, entries_t& entries_out) { - KokkosKernels::sort_and_merge_graph( - rowmap_in, entries_in, rowmap_out, entries_out); -} - -template -[[deprecated]] crsMat_t sort_and_merge_matrix(const crsMat_t& A) { - return KokkosKernels::sort_and_merge_matrix(A); -} template < typename View, typename ExecSpace, typename Ordinal, diff --git a/src/common/KokkosKernels_Utils.hpp b/src/common/KokkosKernels_Utils.hpp index 655d89ba67..eae4080879 100644 --- a/src/common/KokkosKernels_Utils.hpp +++ b/src/common/KokkosKernels_Utils.hpp @@ -49,7 +49,6 @@ #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosKernels_SimpleUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" #include "KokkosKernels_PrintUtils.hpp" #include "KokkosKernels_VectorUtils.hpp" @@ -516,7 +515,7 @@ struct PropogataMaxValstoZeros { } KOKKOS_INLINE_FUNCTION - void join(volatile idx &update, volatile const idx &input) const { + void join(idx &update, const idx 
&input) const { if (input > update) update = input; } }; @@ -1261,7 +1260,7 @@ struct ReduceRowSizeFunctor { } } KOKKOS_INLINE_FUNCTION - void join(volatile size_type &dst, const volatile size_type &src) const { + void join(size_type &dst, const size_type &src) const { if (dst < src) { dst = src; } @@ -1306,7 +1305,7 @@ struct ReduceMaxRowFunctor { } } KOKKOS_INLINE_FUNCTION - void join(volatile value_type &dst, const volatile value_type &src) const { + void join(value_type &dst, const value_type &src) const { if (dst < src) { dst = src; } @@ -1351,9 +1350,7 @@ struct IsEqualFunctor { } KOKKOS_INLINE_FUNCTION - void join(volatile int &dst, const volatile int &src) const { - dst = dst & src; - } + void join(int &dst, const int &src) const { dst = dst & src; } KOKKOS_INLINE_FUNCTION void init(int &dst) const { dst = 1; } }; @@ -1467,11 +1464,6 @@ struct array_sum_reduce { for (int i = 0; i < N; i++) data[i] += src.data[i]; return *this; } - KOKKOS_INLINE_FUNCTION // volatile add operator - void - operator+=(const volatile ValueType &src) volatile { - for (int i = 0; i < N; i++) data[i] += src.data[i]; - } }; template diff --git a/src/common/KokkosKernels_default_types.hpp b/src/common/KokkosKernels_default_types.hpp index 4012b2e158..d70a6b27ac 100644 --- a/src/common/KokkosKernels_default_types.hpp +++ b/src/common/KokkosKernels_default_types.hpp @@ -79,6 +79,8 @@ using default_scalar = double; using default_scalar = float; #elif defined(KOKKOSKERNELS_INST_HALF) using default_scalar = Kokkos::Experimental::half_t; +#elif defined(KOKKOSKERNELS_INST_BHALF) +using default_scalar = Kokkos::Experimental::bhalf_t; #else using default_scalar = double; #endif diff --git a/src/common/Kokkos_ArithTraits.hpp b/src/common/Kokkos_ArithTraits.hpp new file mode 100644 index 0000000000..108e845694 --- /dev/null +++ b/src/common/Kokkos_ArithTraits.hpp @@ -0,0 +1,2083 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_ARITHTRAITS_HPP +#define KOKKOS_ARITHTRAITS_HPP + +/// \file Kokkos_ArithTraits.hpp +/// \brief Declaration and definition of Kokkos::Details::ArithTraits + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include // std::complex +#include // std::numeric_limits +#ifdef __CUDACC__ +#include +#endif + +namespace { // anonymous + +/// \fn intPowImpl +/// \tparam IntType A built-in integer type. +/// \brief Implementation of intPowSigned and intPowUnsigned. +/// +/// \pre x != 0 +/// \pre y > 0 +/// +/// Use intPowSigned or intPowUnsigned for general y. +template +KOKKOS_FORCEINLINE_FUNCTION IntType intPowImpl(const IntType x, + const IntType y) { + // Recursion (unrolled into while loop): pow(x, 2y) = (x^y)^2 + IntType prod = x; + IntType y_cur = 1; + // If y == 1, then prod stays x. + while (y_cur < y) { + prod = prod * prod; + y_cur = y_cur << 1; + } + // abs(y - y_cur) < floor(log2(y)), so it won't hurt asymptotic run + // time to finish the remainder in a linear iteration. + if (y > y_cur) { + const IntType left = y - y_cur; + for (IntType k = 0; k < left; ++k) { + prod = prod * x; + } + } else if (y < y_cur) { + // There's probably a better way to do this in order to avoid the + // (expensive) integer division, but I'm not motivated to think of + // it at the moment. + const IntType left = y_cur - y; + for (IntType k = 0; k < left; ++k) { + prod = prod / x; + } + } + return prod; + + // y = 8: + // + // x,1 -> x^2,2 + // x^2,2 -> x^4,4 + // x^4,4 -> x^8,8 + // + // y = 9: + // + // x,1 -> x^2,2 + // x^2,2 -> x^4,4 + // x^4,4 -> x^8,8 + // + // y - y_cur is what's left over. Just do it one at a time. 
+ // + // y = 3: + // x,1 -> x^2,2 + // x^2,2 -> x^4,4 +} + +// Warning free abs function for types where we don't know whether they are +// signed (like char) +template ::is_signed> +struct integer_abs { + static KOKKOS_INLINE_FUNCTION T abs(const T& val); +}; + +template +struct integer_abs { + static KOKKOS_INLINE_FUNCTION T abs(const T& x) { return x < 0 ? -x : x; } +}; + +template +struct integer_abs { + static KOKKOS_INLINE_FUNCTION T abs(const T& x) { return x; } +}; + +/// \fn intPowSigned +/// \tparam IntType A built-in signed integer type. +/// \brief Compute x raised to the power y. +/// +/// If the arguments are invalid (e.g., if x and y are both zero), the +/// result of this function is undefined. However, this function will +/// not throw an exception in that case. +template +KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if::is_signed, + IntType>::type + intPowSigned(const IntType x, const IntType y) { + // It's not entirely clear what to return if x and y are both zero. + // In the case of floating-point numbers, 0^0 is NaN. Here, though, + // I think it's safe to return 0. + if (x == 0) { + return 0; + } else if (y == 0) { + return 1; + } else if (y < 0) { + if (x == 1) { + return 1; + } else if (x == -1) { + return (y % 2 == 0) ? 1 : -1; + } else { + return 0; // round the fraction to zero + } + } + return intPowImpl(x, y); +} +template +KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if::is_signed, + IntType>::type + intPowSigned(const IntType x, const IntType y) { + // It's not entirely clear what to return if x and y are both zero. + // In the case of floating-point numbers, 0^0 is NaN. Here, though, + // I think it's safe to return 0. + if (x == 0) { + return 0; + } else if (y == 0) { + return 1; + } + return intPowImpl(x, y); +} + +/// \fn intPowUnsigned +/// \tparam IntType A built-in unsigned integer type. +/// \brief Compute x raised to the power y. 
+/// +/// If the arguments are invalid (e.g., if x and y are both zero), the +/// result of this function is undefined. However, this function will +/// not throw an exception in that case. +template +KOKKOS_FORCEINLINE_FUNCTION IntType intPowUnsigned(const IntType x, + const IntType y) { + // It's not entirely clear what to return if x and y are both zero. + // In the case of floating-point numbers, 0^0 is NaN. Here, though, + // I think it's safe to return 0. + if (x == 0) { + return 0; + } else if (y == 0) { + return 1; + } else { + return intPowImpl(x, y); + } +} + +// It might make sense to use special sqrt() approximations for +// integer arguments, like those presented on the following web site: +// +// http://www.azillionmonkeys.com/qed/sqroot.html#implementations +// +// Note that some of the implementations on the above page break ANSI +// C(++) aliasing rules (by assigning to the results of +// reinterpret_cast-ing between int and float). It's also just a +// performance optimization and not required for a reasonable +// implementation. 
+ +} // namespace + +namespace Kokkos { +namespace Details { + +// Macro to automate the wrapping of Kokkos Mathematical Functions +// in the ArithTraits struct for real floating point types, hopefully +// this can be expanded to Kokkos::half_t and Kokkos::bhalf_t +#define KOKKOSKERNELS_ARITHTRAITS_REAL_FP(FUNC_QUAL) \ + static FUNC_QUAL val_type zero() { return static_cast(0); } \ + static FUNC_QUAL val_type one() { return static_cast(1); } \ + static FUNC_QUAL val_type min() { \ + return Kokkos::Experimental::finite_min::value; \ + } \ + static FUNC_QUAL val_type max() { \ + return Kokkos::Experimental::finite_max::value; \ + } \ + static FUNC_QUAL val_type infinity() { \ + return Kokkos::Experimental::infinity::value; \ + } \ + static FUNC_QUAL val_type nan() { \ + return Kokkos::Experimental::quiet_NaN::value; \ + } \ + static FUNC_QUAL mag_type epsilon() { \ + return Kokkos::Experimental::epsilon::value; \ + } \ + static FUNC_QUAL mag_type sfmin() { \ + return Kokkos::Experimental::norm_min::value; \ + } \ + static FUNC_QUAL int base() { \ + return Kokkos::Experimental::radix::value; \ + } \ + static FUNC_QUAL mag_type prec() { \ + return epsilon() * static_cast(base()); \ + } \ + static FUNC_QUAL int t() { \ + return Kokkos::Experimental::digits::value; \ + } \ + static FUNC_QUAL mag_type rnd() { return one(); } \ + static FUNC_QUAL int emin() { \ + return Kokkos::Experimental::min_exponent::value; \ + } \ + static FUNC_QUAL mag_type rmin() { \ + return Kokkos::Experimental::norm_min::value; \ + } \ + static FUNC_QUAL int emax() { \ + return Kokkos::Experimental::max_exponent::value; \ + } \ + static FUNC_QUAL mag_type rmax() { \ + return Kokkos::Experimental::finite_max::value; \ + } \ + \ + static FUNC_QUAL bool isInf(const val_type x) { return Kokkos::isinf(x); } \ + static FUNC_QUAL bool isNan(const val_type x) { return Kokkos::isnan(x); } \ + static FUNC_QUAL mag_type abs(const val_type x) { return Kokkos::abs(x); } \ + static FUNC_QUAL mag_type 
real(const val_type x) { return Kokkos::real(x); } \ + static FUNC_QUAL mag_type imag(const val_type x) { return Kokkos::imag(x); } \ + static FUNC_QUAL val_type conj(const val_type x) { return x; } \ + static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type sqrt(const val_type x) { return Kokkos::sqrt(x); } \ + static FUNC_QUAL val_type cbrt(const val_type x) { return Kokkos::cbrt(x); } \ + static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ + static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ + static FUNC_QUAL val_type log10(const val_type x) { \ + return Kokkos::log10(x); \ + } \ + static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ + static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ + static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ + static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ + static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ + static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ + static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ + static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ + static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ + \ + static FUNC_QUAL bool isnaninf(const val_type x) { \ + return isNan(x) || isInf(x); \ + } \ + static FUNC_QUAL magnitudeType magnitude(const val_type x) { \ + return abs(x); \ + } \ + static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ + static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ + static FUNC_QUAL mag_type eps() { return epsilon(); } + +#define KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(FUNC_QUAL) \ + \ + static constexpr bool is_specialized = true; \ + static constexpr bool is_signed = true; \ 
+ static constexpr bool is_integer = false; \ + static constexpr bool is_exact = false; \ + static constexpr bool is_complex = true; \ + static constexpr bool has_infinity = true; \ + \ + using magnitudeType = mag_type; \ + using halfPrecision = \ + ::Kokkos::complex::halfPrecision>; \ + using doublePrecision = \ + ::Kokkos::complex::doublePrecision>; \ + \ + static constexpr bool isComplex = true; \ + static constexpr bool isOrdinal = false; \ + static constexpr bool isComparable = false; \ + static constexpr bool hasMachineParameters = \ + ArithTraits::hasMachineParameters; \ + \ + static FUNC_QUAL val_type zero() { \ + return val_type(ArithTraits::zero(), \ + ArithTraits::zero()); \ + } \ + static FUNC_QUAL val_type one() { \ + return val_type(ArithTraits::one(), \ + ArithTraits::zero()); \ + } \ + static FUNC_QUAL val_type min() { \ + return val_type(ArithTraits::min(), \ + ArithTraits::min()); \ + } \ + static FUNC_QUAL val_type max() { \ + return val_type(ArithTraits::max(), \ + ArithTraits::max()); \ + } \ + static FUNC_QUAL val_type infinity() { \ + return val_type(ArithTraits::infinity(), \ + ArithTraits::infinity()); \ + } \ + static FUNC_QUAL val_type nan() { \ + return val_type(ArithTraits::nan(), \ + ArithTraits::nan()); \ + } \ + static FUNC_QUAL mag_type epsilon() { \ + return ArithTraits::epsilon(); \ + } \ + static FUNC_QUAL mag_type sfmin() { return ArithTraits::sfmin(); } \ + static FUNC_QUAL int base() { return ArithTraits::base(); } \ + static FUNC_QUAL mag_type prec() { return ArithTraits::prec(); } \ + static FUNC_QUAL int t() { return ArithTraits::t(); } \ + static FUNC_QUAL mag_type rnd() { return ArithTraits::rnd(); } \ + static FUNC_QUAL int emin() { return ArithTraits::emin(); } \ + static FUNC_QUAL mag_type rmin() { return ArithTraits::rmin(); } \ + static FUNC_QUAL int emax() { return ArithTraits::emax(); } \ + static FUNC_QUAL mag_type rmax() { return ArithTraits::rmax(); } \ + static FUNC_QUAL bool isInf(const val_type x) { \ + 
return ArithTraits::isInf(x.real()) || \ + ArithTraits::isInf(x.imag()); \ + } \ + static FUNC_QUAL bool isNan(const val_type x) { \ + return ArithTraits::isNan(x.real()) || \ + ArithTraits::isNan(x.imag()); \ + } \ + static FUNC_QUAL mag_type abs(const val_type x) { return ::Kokkos::abs(x); } \ + static FUNC_QUAL mag_type real(const val_type x) { return x.real(); } \ + static FUNC_QUAL mag_type imag(const val_type x) { return x.imag(); } \ + static FUNC_QUAL val_type conj(const val_type x) { \ + return ::Kokkos::conj(x); \ + } \ + static FUNC_QUAL val_type pow(const val_type x, const val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type pow(const val_type x, const mag_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type pow(const mag_type x, const val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static FUNC_QUAL val_type sqrt(const val_type x) { \ + return ::Kokkos::sqrt(x); \ + } \ + static FUNC_QUAL val_type exp(const val_type x) { return Kokkos::exp(x); } \ + static FUNC_QUAL val_type log(const val_type x) { return Kokkos::log(x); } \ + static FUNC_QUAL val_type log10(const val_type x) { \ + return Kokkos::log10(x); \ + } \ + static FUNC_QUAL val_type sin(const val_type x) { return Kokkos::sin(x); } \ + static FUNC_QUAL val_type cos(const val_type x) { return Kokkos::cos(x); } \ + static FUNC_QUAL val_type tan(const val_type x) { return Kokkos::tan(x); } \ + static FUNC_QUAL val_type sinh(const val_type x) { return Kokkos::sinh(x); } \ + static FUNC_QUAL val_type cosh(const val_type x) { return Kokkos::cosh(x); } \ + static FUNC_QUAL val_type tanh(const val_type x) { return Kokkos::tanh(x); } \ + static FUNC_QUAL val_type asin(const val_type x) { return Kokkos::asin(x); } \ + static FUNC_QUAL val_type acos(const val_type x) { return Kokkos::acos(x); } \ + static FUNC_QUAL val_type atan(const val_type x) { return Kokkos::atan(x); } \ + static FUNC_QUAL bool isnaninf(const val_type& x) { \ + return isNan(x) || 
isInf(x); \ + } \ + static FUNC_QUAL mag_type magnitude(const val_type x) { return abs(x); } \ + static FUNC_QUAL val_type conjugate(const val_type x) { return conj(x); } \ + static FUNC_QUAL val_type squareroot(const val_type x) { return sqrt(x); } \ + static FUNC_QUAL mag_type eps() { return epsilon(); } + +template +static KOKKOS_FUNCTION + typename std::enable_if::is_signed, + val_type>::type + KokkosKernelsAbs(const val_type x) { + return Kokkos::abs(x); +} + +template +static KOKKOS_FUNCTION + typename std::enable_if::is_signed, + val_type>::type + KokkosKernelsAbs(const val_type x) { + return x; +} + +template +static KOKKOS_FUNCTION + typename std::enable_if::is_signed, + val_type>::type + KokkosKernelsNan() { + return -1; +} + +template +static KOKKOS_FUNCTION + typename std::enable_if::is_signed, + val_type>::type + KokkosKernelsNan() { + return Kokkos::Experimental::finite_max::value; +} + +#define KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() \ + \ + static constexpr bool is_specialized = true; \ + static constexpr bool is_integer = true; \ + static constexpr bool is_exact = true; \ + static constexpr bool is_complex = false; \ + static constexpr bool has_infinity = false; \ + \ + using magnitudeType = mag_type; \ + using halfPrecision = val_type; \ + using doublePrecision = val_type; \ + \ + static constexpr bool isComplex = false; \ + static constexpr bool isOrdinal = true; \ + static constexpr bool isComparable = true; \ + static constexpr bool hasMachineParameters = false; \ + \ + static KOKKOS_FUNCTION val_type zero() { return static_cast(0); } \ + static KOKKOS_FUNCTION val_type one() { return static_cast(1); } \ + static KOKKOS_FUNCTION val_type min() { \ + return Kokkos::Experimental::finite_min::value; \ + } \ + static KOKKOS_FUNCTION val_type max() { \ + return Kokkos::Experimental::finite_max::value; \ + } \ + static KOKKOS_FUNCTION val_type infinity() { \ + return static_cast(0); \ + } \ + static KOKKOS_FUNCTION val_type nan() { \ + return 
KokkosKernelsNan(); \ + } \ + static KOKKOS_FUNCTION bool isInf(const val_type) { return false; } \ + static KOKKOS_FUNCTION bool isNan(const val_type) { return false; } \ + static KOKKOS_FUNCTION mag_type abs(const val_type x) { \ + return KokkosKernelsAbs(x); \ + } \ + static KOKKOS_FUNCTION mag_type real(const val_type x) { \ + return Kokkos::real(x); \ + } \ + static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); } \ + static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } \ + static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { \ + return Kokkos::pow(x, y); \ + } \ + static KOKKOS_FUNCTION val_type sqrt(const val_type x) { \ + return static_cast(Kokkos::sqrt(abs(x))); \ + } \ + static KOKKOS_FUNCTION val_type cbrt(const val_type x) { \ + return static_cast(Kokkos::cbrt(abs(x))); \ + } \ + static KOKKOS_FUNCTION val_type exp(const val_type x) { \ + return static_cast(Kokkos::exp(abs(x))); \ + } \ + static KOKKOS_FUNCTION val_type log(const val_type x) { \ + return static_cast(Kokkos::log(abs(x))); \ + } \ + static KOKKOS_FUNCTION val_type log10(const val_type x) { \ + return static_cast(Kokkos::log10(abs(x))); \ + } \ + static KOKKOS_FUNCTION mag_type epsilon() { return zero(); } \ + static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { \ + return abs(x); \ + } \ + static KOKKOS_FUNCTION val_type conjugate(const val_type x) { \ + return conj(x); \ + } \ + static KOKKOS_FUNCTION bool isnaninf(const val_type) { return false; } \ + static KOKKOS_FUNCTION val_type squareroot(const val_type x) { \ + return sqrt(x); \ + } + +/// \class ArithTraits +/// \brief Traits class for arithmetic on type T. +/// \tparam T "Scalar" type of interest +/// +/// This is a traits class for the "arithmetic" type T. "Arithmetic +/// types" include built-in signed and unsigned integer types, +/// floating-point types, complex-valued types, and anything else that +/// looks like these. 
This class is useful for implementing numerical +/// algorithms that are generic on the data type. You may also use +/// this class to query attributes of T, like whether it is signed or +/// complex, or its precision. +/// +/// We really did not want to implement this class or expose it to +/// users. It would be much better to use existing traits classes +/// like std::numeric_limits. We decided to implement and expose this +/// class for the following reasons: +///
+/// 1. std::numeric_limits class methods cannot be used in CUDA +/// device functions, since they themselves are not device +/// functions
+/// 2. Existing traits classes like std::numeric_limits do not +/// provide enough information to implement algorithms that are +/// agnostic of whether T is real-valued or complex-valued.
+/// +/// All class methods must be suitable for parallel kernels, if the +/// type T itself is suitable for parallel kernels. In particular, +/// specializations for types T that make sense to use on a CUDA +/// device must mark all class methods as device (and host) functions, +/// using the KOKKOS_FORCEINLINE_FUNCTION macro. All class methods must be +/// callable both inside and outside a parallel kernel (for CUDA, this +/// means they must be marked as both device and host functions). +/// +/// \section Kokkos_ArithTraits_compat Compatibility +/// +/// Whenever possible, class methods in ArithTraits use the same names +/// as their equivalents in the C++ Standard Library. If this was not +/// possible, for example with isInf and isNan, we explain why in +/// their documentation. +/// +/// This class has redundant typedefs and methods in order to maintain +/// backwards compatibility with Teuchos::ScalarTraits, while +/// preferring forwards (partial) compatibility with +/// std::numeric_limits. Users should prefer typedefs, \c bool +/// constants, and class methods compatible with std::numeric_limits, +/// to those from Teuchos::ScalarTraits. The latter may go away at +/// any time. Furthermore, Teuchos::ScalarTraits contains methods +/// that do not make sense for use as parallel device functions, in +/// particular those relating to pseudorandom number generation that +/// refer to hidden state, so we will never include all class methods +/// from Teuchos::ScalarTraits in ArithTraits. +/// +/// \section Kokkos_ArithTraits_unsupp Unsupported types on CUDA devices +/// +/// CUDA does not support long double or std::complex in device +/// functions. ArithTraits does have specializations for these types, +/// but the class methods therein are not marked as device functions. +/// +/// \section Kokkos_ArithTraits_whyNotC99 What about C99 integer types? 
+/// +/// C99 and C++11 include typedefs int${N}_t and uint${N}_t, where N +/// is the number of bits in the integer. These typedefs are useful +/// because they make the length of the type explicit. Users are +/// welcome to use these types as the template parameter of +/// ArithTraits. +/// +/// We chose not to use these types when defining full +/// specializations of ArithTraits. This is because the C99 integer +/// types are typedefs, not types in themselves. This makes it +/// impossible to avoid duplicate or missing full specializations of +/// ArithTraits. For example, on my Mac, for CUDA 5.5, gcc 4.2.1, and +/// Clang 3.2, int64_t is a typedef of long long, +/// but long long and long are separate types, even +/// though they have the same length (64 bits). In contrast, on +/// Windows (even Win64), long is a 32-bit type (but a +/// distinct type from int), and long long is a +/// 64-bit type. Thus, if we define full specializations of +/// ArithTraits using only the C99 integer types, we will be +/// missing a specialization for long on at least one +/// platform. +/// +/// Rather than trouble ourselves with trying to figure this out for +/// each platform, we decided to provide specializations only for the +/// integer types in the C89 and C++03 language standards. This +/// includes signed and unsigned versions of char, +/// short, int, and long. We also include +/// long long if your platform supports it. We may thus have +/// left out some C99 integer type, but this is only possible if the +/// C89 / C++03 integer types do not have complete coverage of all +/// powers of two bits from 8 up to the longest provided length (e.g., +/// 64 on a 64-bit system). On all platforms I have encountered, +/// char has 8 bits and short has 16 bits, so I am +/// not worried about missing specializations for int16_t or +/// uint16_t. If you should find that either of these +/// specializations are missing, though, please let us know. 
+/// +/// Note that char, signed char, and unsigned +/// char are distinct types, whether char is signed or +/// unsigned. (The language standards do not specify whether +/// char is signed or unsigned.) That is, char is +/// not a typedef of signed char or unsigned +/// char. This is why we provide full specializations of +/// ArithTraits for each of these types. Interestingly enough, on my +/// system, char and int8_t are different types, but +/// signed char and int8_t are the same. +/// +/// \section Kokkos_ArithTraits_impl Implementation notes +/// +/// This section contains notes to developers who wish to add a +/// partial specialization of this class for a new type T. If you +/// decide to write a default templated implementation, it must not +/// declare any methods as device functions. This ensures correct +/// behavior for arbitrary T, but does require specializations for +/// common types like T = float and double, as well as for other types +/// T that make sense to use on a CUDA device. +template +class ArithTraits { + public: + /// \brief A type that acts like T and works with Kokkos. + /// + /// This is usually just an alias for T. However, some types T do + /// not work well with Kokkos. In that case, we use a mostly + /// equivalent type here. For example, ArithTraits<std::complex<R> + /// >::val_type is Kokkos::complex<R>. + using val_type = T; + /// \brief The type of the magnitude (absolute value) of T. + /// + /// We define this as the type returned by abs() in this class. If + /// T is real (not complex), then \c val_type and \c mag_type are + /// usually the same. If T is std::complex<R> for some R, + /// then R and \c mag_type are usually the same. + using mag_type = T; + + //! Whether ArithTraits has a specialization for T. + static constexpr bool is_specialized = false; + //! Whether T is a signed type (has negative values). + static constexpr bool is_signed = false; + //! Whether T is an integer type.
+ static constexpr bool is_integer = false; + /// \brief Whether T "uses exact representations." + /// + /// The opposite of is_exact is "is approximate," that is, "may + /// commit rounding error." + static constexpr bool is_exact = false; + //! Whether T is a complex-valued type. + static constexpr bool is_complex = false; + + /// \brief Whether x is Inf. + /// + /// This can only be true for floating-point types T that support + /// Inf. If T is a complex type, we say that a T instance x is Inf + /// if and only if isinf(real(x)) || isinf(imag(x)). + /// + /// Unfortunately we can't call this "isinf" (the equivalent C99 + /// function), because CUDA appears to implement that function using + /// a macro, rather than using a function (as C++11 requires). + static KOKKOS_FUNCTION bool isInf(const T& x); + + /// \brief Whether x is NaN (not a number). + /// + /// This can only be true for floating-point types T that support + /// NaN. If T is a complex type, we say that a T instance x is NaN + /// if and only if isNan(real(x)) || isNan(imag(x)). + /// + /// Unfortunately we can't call this "isnan" (the equivalent C99 + /// function), because CUDA appears to implement that function using + /// a macro, rather than using a function (as C++11 requires). + static KOKKOS_FUNCTION bool isNan(const T& x); + + //! The absolute value (magnitude) of x. + static KOKKOS_FUNCTION mag_type abs(const T& x); + + //! The zero value of T; the arithmetic identity. + static KOKKOS_FUNCTION T zero(); + + //! The one value of T; the multiplicative identity. + static KOKKOS_FUNCTION T one(); + + /// \brief True if this type T is capable of representing the + /// positive infinity as a distinct special value, as with + /// std::numeric_limits::has_infinity. + static constexpr bool has_infinity = false; + + /// \brief Returns the special value "positive infinity", as + /// represented by the floating-point type T. Only meaningful if + /// KokkosArithTraits::has_infinity == true. 
Provides same + /// functionality as std::numeric_limits::infinity(). + /// + /// \note Would have liked to mark it as constexpr but then would + /// not be able to provide the specialization for std::complex + /// since its constructor only becomes constexpr with C++14. + static KOKKOS_FUNCTION T infinity(); + + /// \brief The minimum possible value of T. + /// + /// If T is a real floating-point type, then this is the minimum + /// positive value, as with std::numeric_limits::min(). + static KOKKOS_FUNCTION T min(); + + //! The maximum possible value of T. + static KOKKOS_FUNCTION T max(); + + /// \brief The real part of x. + /// + /// If \c is_complex is false, then this just returns x. + static KOKKOS_FUNCTION mag_type real(const T& x); + + /// \brief The imaginary part of x. + /// + /// If \c is_complex is false, then this just returns zero(). + static KOKKOS_FUNCTION mag_type imag(const T&); + + /// \brief The complex conjugate of x. + /// + /// If \c is_complex is false, then this just returns x. + static KOKKOS_FUNCTION T conj(const T&); + + //! x raised to the power y. + static KOKKOS_FUNCTION T pow(const T& x, const T& y); + + /// \brief The square root of x. + /// + /// If T is an integer type, this is the floor of the square root. + /// If T is a complex-valued type, then this method returns the + /// principal branch of the square root. + /// + /// If T is real-valued and x is negative, the result of the square + /// root is undefined in general. (CUDA does not allow throwing + /// exceptions in device functions.) Implementations should return + /// NaN if the type T supports this. Of course, in that case, the + /// square of the result will not equal x. + static KOKKOS_FUNCTION T sqrt(const T& x); + + /// \brief The cubic root of x. + /// + /// If T is an integer type, this is the floor of the cubic root. + /// If T is a complex-valued type, then this method returns the + /// principal branch of the cubic root. 
+ /// + /// If T is real-valued and x is negative, the result of the cubic + /// root is undefined in general. (CUDA does not allow throwing + /// exceptions in device functions.) Implementations should return + /// NaN if the type T supports this. Of course, in that case, the + /// cubic of the result will not equal x. + static KOKKOS_FUNCTION T cbrt(const T& x); + + /// \brief The natural (base e) exponential function of x. + /// + /// If T is an integer type, this is the floor of the exponential + /// function. If T is a complex-valued type, then this method + /// returns \f$e^{x+iy} = e^x ( cos(y) + i sin(y) )\f$. + /// + static KOKKOS_FUNCTION T exp(const T& x); + + /// \brief The natural (base e) logarithm of x. + /// + /// If T is an integer type, this is the floor of the logarithm. If + /// T is a complex-valued type, then this method returns the + /// principal branch of the logarithm. + /// + /// If T is real-valued and x is negative, the result of the + /// logarithm is undefined in general. (CUDA does not allow + /// throwing exceptions in device functions.) Implementations + /// should return NaN if the type T supports this. Of course, in + /// that case, if y is the result, \f$e^y\f$ will not equal x. + static KOKKOS_FUNCTION T log(const T& x); + + /// \brief The base ten logarithm of the input. + /// + /// If T is an integer type, this is the floor of the logarithm. If + /// T is a complex-valued type, then this method returns the + /// principal branch of the logarithm. + /// + /// If T is real-valued and x is negative, the result of the + /// logarithm is undefined in general. (CUDA does not allow + /// throwing exceptions in device functions.) Implementations + /// should return NaN if the type T supports this. Of course, in + /// that case, if y is the result, \f$10^y\f$ will not equal x. + static KOKKOS_FUNCTION T log10(const T& x); + + /// Trigonometric and hyperbolic functions are not available + /// for integer types. 
This is because asin(sin(x)) is not x + /// when x is integer with a rounding error. + /// + /// KJ: log, exp also has this problem. We probably need to + /// disable them for integer types instead of providing + /// functionality with floor. + + /// \brief The sin function of x + /// + static KOKKOS_FUNCTION T sin(const T& x); + + /// \brief The cos function of x + /// + static KOKKOS_FUNCTION T cos(const T& x); + + /// \brief The tan function of x + /// + static KOKKOS_FUNCTION T tan(const T& x); + + /// \brief The sin hyperbolic function of x + /// + static KOKKOS_FUNCTION T sinh(const T& x); + + /// \brief The cos hyperbolic function of x + /// + static KOKKOS_FUNCTION T cosh(const T& x); + + /// \brief The tan hyperbolic function of x + /// + static KOKKOS_FUNCTION T tanh(const T& x); + + /// \brief The asin function of x + /// + static KOKKOS_FUNCTION T asin(const T& x); + + /// \brief The acos function of x + /// + static KOKKOS_FUNCTION T acos(const T& x); + + /// \brief The atan function of x + /// + static KOKKOS_FUNCTION T atan(const T& x); + + /// \brief Return a silent NaN, if appropriate for T. + /// + /// If T does not implement a silent NaN, the return value is + /// undefined, but calling this method is still allowed. + static KOKKOS_FUNCTION T nan(); + + /// \brief Machine epsilon. + /// + /// If T is an integer type (std::numeric_limits::is_exact is + /// true), then epsilon() returns 0. Otherwise, if T is a + /// floating-point type, it returns machine epsilon for T. + static KOKKOS_FUNCTION mag_type epsilon(); + + //@{ + /// \name Traits defined for backwards compatibility with + /// Teuchos::ScalarTraits + /// + /// All of the typedefs, \c bool constants, and class methods in + /// this section are defined in order that one may replace most uses + /// of Teuchos::ScalarTraits with ArithTraits. Users who do not + /// have this backwards compatibility requirement should prefer + /// equivalents in other sections.
Those class methods which have + /// the same name and meaning in both Teuchos::ScalarTraits and this + /// class, such as log() and pow(), are not in this section. + + //! Same as mag_type; the type of the absolute value (magnitude) of T. + using magnitudeType = T; + + /// \brief The type with "half the precision" of T. + /// + /// This typedef only makes sense if T is a floating-point type. + using halfPrecision = T; + + /// \brief The type with "twice the precision" of T. + /// + /// This typedef only makes sense if T is a floating-point type. + using doublePrecision = T; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = false; + + /// \brief True if this type T has floating-point parameters. + /// + /// This is true if and only if this specialization of ArithTraits + /// has "machine-specific" parameters eps(), sfmin(), base(), + /// prec(), t(), rnd(), emin(), rmin(), emax(), and rmax(), relating + /// to floating-point types. + static constexpr bool hasMachineParameters = false; + + //! Return relative machine precision. + static KOKKOS_FUNCTION mag_type eps(); + + //! Return safe minimum (sfmin), such that 1/sfmin does not overflow. + static KOKKOS_FUNCTION mag_type sfmin(); + + //! Return the base of the scalar type T. + static KOKKOS_FUNCTION int base(); + + //! Return eps*base. + static KOKKOS_FUNCTION mag_type prec(); + + //! Returns the number of (base) digits in the significand. + static KOKKOS_FUNCTION int t(); + + //! 1.0 when rounding occurs in addition, else 0.0. + static KOKKOS_FUNCTION mag_type rnd(); + + //! Returns the minimum exponent before (gradual) underflow. + static KOKKOS_FUNCTION int emin(); + + //! Returns the underflow threshold: base^(emin-1) + static KOKKOS_FUNCTION mag_type rmin(); + + //! Returns the largest exponent before overflow. + static KOKKOS_FUNCTION int emax(); + + //!
Overflow threshold: (base^emax)*(1-eps) + static KOKKOS_FUNCTION mag_type rmax(); + + //! Same as abs(); return the magnitude of x. + static KOKKOS_FUNCTION magnitudeType magnitude(const T& x); + + //! Same as conj(); return the complex conjugate of x. + static KOKKOS_FUNCTION T conjugate(const T& x); + + /// \brief Whether x is (silent) NaN or Inf. + /// + /// This is the same as isNan(x) || isInf(x). + static KOKKOS_FUNCTION bool isnaninf(const T& x); + + /// \brief The string name of T. + /// + /// Note that this is not a device function. + static std::string name(); + + //! Same as sqrt(x); the square root of x. + static KOKKOS_FUNCTION T squareroot(const T& x); + //@} +}; + +// Since Kokkos::Experimental::half_t falls back to float, only define +// ArithTraits if half_t is a backend specialization +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +template <> +class ArithTraits { + public: + using val_type = Kokkos::Experimental::half_t; + using mag_type = val_type; + + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = false; + + static constexpr bool has_infinity = true; + static KOKKOS_FUNCTION val_type infinity() { + return Kokkos::Experimental::cast_to_half( + Kokkos::Experimental::infinity::value); + } + + static KOKKOS_FUNCTION bool isInf(const val_type x) { +#ifndef __CUDA_ARCH__ + using std::isinf; +#endif + return isinf(Kokkos::Experimental::cast_from_half(x)); + } + static KOKKOS_FUNCTION bool isNan(const val_type x) { +#ifndef __CUDA_ARCH__ + using std::isnan; +#endif + return isnan(Kokkos::Experimental::cast_from_half(x)); + } + static KOKKOS_FUNCTION mag_type abs(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::abs(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type zero() { + return Kokkos::Experimental::cast_to_half(0.0);
+ } + static KOKKOS_FUNCTION val_type one() { + return Kokkos::Experimental::cast_to_half(1.0); + } + static KOKKOS_FUNCTION val_type min() { + return Kokkos::Experimental::cast_to_half(-KOKKOSKERNELS_IMPL_FP16_MAX); + } + static KOKKOS_FUNCTION val_type max() { + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); + } + static KOKKOS_FUNCTION mag_type real(const val_type x) { return x; } + static KOKKOS_FUNCTION mag_type imag(const val_type) { return zero(); } + static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } + static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { + return Kokkos::Experimental::cast_to_half( + Kokkos::pow(Kokkos::Experimental::cast_from_half(x), + Kokkos::Experimental::cast_from_half(y))); + } + static KOKKOS_FUNCTION val_type sqrt(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::sqrt(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type cbrt(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::cbrt(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type exp(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::exp(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type log(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::log(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type log10(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::log10(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type sin(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::sin(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type cos(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::cos(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type tan(const val_type x) { + 
return Kokkos::Experimental::cast_to_half( + Kokkos::tan(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type sinh(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::sinh(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type cosh(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::cosh(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type tanh(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::tanh(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type asin(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::asin(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type acos(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::acos(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION val_type atan(const val_type x) { + return Kokkos::Experimental::cast_to_half( + Kokkos::atan(Kokkos::Experimental::cast_from_half(x))); + } + static KOKKOS_FUNCTION mag_type epsilon() { + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_EPSILON); + } + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + // C++ doesn't have a standard "half-float" type. 
+ using halfPrecision = val_type; + using doublePrecision = double; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; + static KOKKOS_FUNCTION bool isnaninf(const val_type x) { + return isNan(x) || isInf(x); + } + static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { + return abs(x); + } + static KOKKOS_FUNCTION val_type conjugate(const val_type x) { + return conj(x); + } + static std::string name() { return "half"; } + static KOKKOS_FUNCTION val_type squareroot(const val_type x) { + return sqrt(x); + } + static KOKKOS_FUNCTION val_type nan() { + return Kokkos::Experimental::cast_to_half( + Kokkos::Experimental::quiet_NaN::value); + } + static KOKKOS_FUNCTION mag_type eps() { return epsilon(); } + static KOKKOS_FUNCTION mag_type sfmin() { + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); + } + static KOKKOS_FUNCTION int base() { return KOKKOSKERNELS_IMPL_FP16_RADIX; } + // Use float to allow running on both host and device + static KOKKOS_FUNCTION float prec() { + float e = KOKKOSKERNELS_IMPL_FP16_EPSILON; + float b = (float)base(); + float r = e * b; + return r; + } + static KOKKOS_FUNCTION int t() { return KOKKOSKERNELS_IMPL_FP16_MANT_DIG; } + static KOKKOS_FUNCTION mag_type rnd() { return one(); } + static KOKKOS_FUNCTION int emin() { return KOKKOSKERNELS_IMPL_FP16_MIN_EXP; } + static KOKKOS_FUNCTION mag_type rmin() { + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MIN); + } + static KOKKOS_FUNCTION int emax() { return KOKKOSKERNELS_IMPL_FP16_MAX_EXP; } + static KOKKOS_FUNCTION mag_type rmax() { + return Kokkos::Experimental::cast_to_half(KOKKOSKERNELS_IMPL_FP16_MAX); + } +}; +#endif // defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT + +// Since Kokkos::Experimental::bhalf_t falls back to float, only define +// ArithTraits if bhalf_t is a backend specialization +#if
defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +template <> +class ArithTraits { + public: + using val_type = Kokkos::Experimental::bhalf_t; + using mag_type = val_type; + + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = false; + + static constexpr bool has_infinity = true; + static KOKKOS_FUNCTION val_type infinity() { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::Experimental::infinity::value); + } + + static KOKKOS_FUNCTION bool isInf(const val_type x) { + return Kokkos::isinf(Kokkos::Experimental::cast_from_bhalf(x)); + } + static KOKKOS_FUNCTION bool isNan(const val_type x) { + return Kokkos::isnan(Kokkos::Experimental::cast_from_bhalf(x)); + } + static KOKKOS_FUNCTION mag_type abs(const val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::abs(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type zero() { + return Kokkos::Experimental::cast_to_bhalf(0.0F); + } + static KOKKOS_FUNCTION val_type one() { + return Kokkos::Experimental::cast_to_bhalf(1.0F); + } + static KOKKOS_FUNCTION val_type min() { + return Kokkos::Experimental::cast_to_bhalf(-KOKKOSKERNELS_IMPL_BF16_MAX); + } + static KOKKOS_FUNCTION val_type max() { + return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); + } + static KOKKOS_FUNCTION mag_type real(const val_type x) { return x; } + static KOKKOS_FUNCTION mag_type imag(const val_type) { + return Kokkos::Experimental::cast_to_bhalf(0.0F); + } + static KOKKOS_FUNCTION val_type conj(const val_type x) { return x; } + static KOKKOS_FUNCTION val_type pow(const val_type x, const val_type y) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::pow(Kokkos::Experimental::cast_from_bhalf(x), + Kokkos::Experimental::cast_from_bhalf(y))); + } + static KOKKOS_FUNCTION val_type sqrt(const val_type x) { + 
return Kokkos::Experimental::cast_to_bhalf( + Kokkos::sqrt(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type cbrt(const val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::cbrt(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type exp(const val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::exp(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type log(const val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::log(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type log10(const val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::log10(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type sin(const val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::sin(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type cos(const val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::cos(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type tan(const val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::tan(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type sinh(const val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::sinh(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type cosh(const val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::cosh(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type tanh(const val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::tanh(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type asin(const val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::asin(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type acos(const 
val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::acos(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION val_type atan(const val_type x) { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::atan(Kokkos::Experimental::cast_from_bhalf(x))); + } + static KOKKOS_FUNCTION mag_type epsilon() { + // return ::pow(2, -KOKKOSKERNELS_IMPL_BF16_SIGNIFICAND_BITS); + return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_EPSILON); + } + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + // C++ doesn't have a standard "bhalf-float" type. + using bhalfPrecision = val_type; + using doublePrecision = double; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; + static KOKKOS_FUNCTION bool isnaninf(const val_type x) { + return isNan(x) || isInf(x); + } + static KOKKOS_FUNCTION magnitudeType magnitude(const val_type x) { + return abs(x); + } + static KOKKOS_FUNCTION val_type conjugate(const val_type x) { + return conj(x); + } + static std::string name() { return "bhalf"; } + static KOKKOS_FUNCTION val_type squareroot(const val_type x) { + return sqrt(x); + } + static KOKKOS_FUNCTION val_type nan() { + return Kokkos::Experimental::cast_to_bhalf( + Kokkos::Experimental::quiet_NaN::value); + } + static KOKKOS_FUNCTION mag_type eps() { return epsilon(); } + static KOKKOS_FUNCTION mag_type sfmin() { + return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN); + } + static KOKKOS_FUNCTION int base() { return KOKKOSKERNELS_IMPL_BF16_RADIX; } + // Use float to allow running on both host and device + static KOKKOS_FUNCTION float prec() { + float e = KOKKOSKERNELS_IMPL_BF16_EPSILON; + float b = (float)base(); + float r = e * b; + return r; + } + static KOKKOS_FUNCTION int t() { return KOKKOSKERNELS_IMPL_BF16_MANT_DIG; } + static KOKKOS_FUNCTION 
mag_type rnd() { return one(); } + static KOKKOS_FUNCTION int emin() { return KOKKOSKERNELS_IMPL_BF16_MIN_EXP; } + static KOKKOS_FUNCTION mag_type rmin() { + return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MIN); + } + static KOKKOS_FUNCTION int emax() { return KOKKOSKERNELS_IMPL_BF16_MAX_EXP; } + static KOKKOS_FUNCTION mag_type rmax() { + return Kokkos::Experimental::cast_to_bhalf(KOKKOSKERNELS_IMPL_BF16_MAX); + } +}; +#endif // KOKKOS_BHALF_T_IS_FLOAT + +template <> +class ArithTraits { + public: + using val_type = float; + using mag_type = val_type; + + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = false; + static constexpr bool has_infinity = true; + + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + using halfPrecision = float; // Should we switch to Kokkos::half_t + using doublePrecision = double; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; + + static std::string name() { return "float"; } + + KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION) +}; + +template <> +class ArithTraits { + public: + using val_type = double; + using mag_type = val_type; + + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = false; + static constexpr bool has_infinity = true; + + // Backwards compatibility with Teuchos::ScalarTraits. 
+ using magnitudeType = mag_type; + using halfPrecision = float; +#if defined(__CUDA_ARCH__) + using doublePrecision = + double; // CUDA doesn't support long double, unfortunately +#elif defined(__HIP_DEVICE_COMPILE__) + using doublePrecision = + double; // HIP does not support long double unfortunately +#else + using doublePrecision = long double; +#endif // __CUDA_ARCH__ + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; + + static std::string name() { return "double"; } + + KOKKOSKERNELS_ARITHTRAITS_REAL_FP(KOKKOS_FUNCTION) +}; + +// CUDA and HIP do not support long double in device functions, +// so none of the class methods in this specialization are marked +// as device functions. +template <> +class ArithTraits { + public: + using val_type = long double; + using mag_type = long double; + + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = false; + static constexpr bool has_infinity = true; + + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + using halfPrecision = double; + // It might be appropriate to use QD's qd_real here. + // For now, long double is the most you get. + using doublePrecision = val_type; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; + + static std::string name() { return "long double"; } + + KOKKOSKERNELS_ARITHTRAITS_REAL_FP() +}; // long double specialization + +#if defined(KOKKOS_ENABLE_LIBQUADMATH) +// CUDA does not support __float128 in device functions, so none of +// the class methods in this specialization are marked as device +// functions. 
+template <> +class ArithTraits<__float128> { + public: + using val_type = __float128; + using mag_type = val_type; + + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = false; + static constexpr bool has_infinity = true; + + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + using halfPrecision = double; + // Unfortunately, we can't rely on a standard __float256 type. + using doublePrecision = __float128; + + static constexpr bool isComplex = false; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = true; + static constexpr bool hasMachineParameters = true; + + static std::string name() { return "__float128"; } + + KOKKOSKERNELS_ARITHTRAITS_REAL_FP() +}; // __float128 specialization +#endif // KOKKOS_ENABLE_LIBQUADMATH + +template <> +class ArithTraits< ::Kokkos::complex > { + public: + using val_type = ::Kokkos::complex; + using mag_type = float; + + static std::string name() { return "Kokkos::complex"; } + + KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(KOKKOS_FUNCTION) +}; + +template <> +class ArithTraits< ::Kokkos::complex > { + public: + using val_type = ::Kokkos::complex; + using mag_type = double; + + static std::string name() { return "Kokkos::complex"; } + + KOKKOSKERNELS_ARITHTRAITS_CMPLX_FP(KOKKOS_FUNCTION) +}; + +/// \brief Partial specialization for std::complex. +/// +/// The C++ Standard Library (with C++03 at least) only allows +/// std::complex for RealFloatType = float, double, or +/// long double. +template +class ArithTraits > { + public: + //! Kokkos internally replaces std::complex with Kokkos::complex. 
+ using val_type = ::Kokkos::complex; + using mag_type = RealFloatType; + + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool is_complex = true; + + static constexpr bool has_infinity = true; + static std::complex infinity() { + return std::complex(ArithTraits::infinity(), + ArithTraits::infinity()); + } + +#ifdef KOKKOS_ENABLE_SYCL + template + static bool isInf(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isinf; +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) + using sycl::isinf; +#endif + return isinf(real(x)) || isinf(imag(x)); + } + template <> + static bool isInf(const std::complex& x) { + Kokkos::abort("isInf not available for std::complex!\n"); + return true; + } +#else + static bool isInf(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isinf; +#endif + return isinf(real(x)) || isinf(imag(x)); + } +#endif +#ifdef KOKKOS_ENABLE_SYCL + template + static bool isNan(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isnan; +#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL) + using sycl::isnan; +#endif + return isnan(real(x)) || isnan(imag(x)); + } + template <> + static bool isNan(const std::complex& x) { + Kokkos::abort("isNan not available for std::complex!\n"); + return true; + } +#else + static bool isNan(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST + using std::isnan; +#endif + return isnan(real(x)) || isnan(imag(x)); + } +#endif + static mag_type abs(const std::complex& x) { + return std::abs(x); + } + static std::complex zero() { + return std::complex(ArithTraits::zero(), + ArithTraits::zero()); + } + static std::complex one() { + return std::complex(ArithTraits::one(), + ArithTraits::zero()); + } + static std::complex min() { + return 
std::complex(ArithTraits::min(), + ArithTraits::zero()); + } + static std::complex max() { + return std::complex(ArithTraits::max(), + ArithTraits::zero()); + } + static mag_type real(const std::complex& x) { + return std::real(x); + } + static mag_type imag(const std::complex& x) { + return std::imag(x); + } + static std::complex conj( + const std::complex& x) { + return std::conj(x); + } + static std::complex pow(const std::complex& x, + const std::complex& y) { + // Fix for some weird gcc 4.2.1 inaccuracy. + if (y == one()) { + return x; + } else if (y == one() + one()) { + return x * x; + } else { + return std::pow(x, y); + } + } + static std::complex pow(const std::complex& x, + const RealFloatType& y) { + // Fix for some weird gcc 4.2.1 inaccuracy. + if (y == ArithTraits::one()) { + return x; + } else if (y == ArithTraits::one() + + ArithTraits::one()) { + return x * x; + } else { + return std::pow(x, y); + } + } + static std::complex sqrt( + const std::complex& x) { + return std::sqrt(x); + } + static std::complex cbrt( + const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::cbrt(x); +#else + return ::cbrt(x); +#endif + } + static std::complex exp(const std::complex& x) { + return std::exp(x); + } + static std::complex log(const std::complex& x) { + return std::log(x); + } + static std::complex log10( + const std::complex& x) { + return std::log10(x); + } + static std::complex sin(const std::complex& x) { + return std::sin(x); + } + static std::complex cos(const std::complex& x) { + return std::cos(x); + } + static std::complex tan(const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::tan(x); +#else + return std::tan(x); +#endif + } + static std::complex sinh( + const std::complex& x) { + return std::sinh(x); + } + static std::complex cosh( + const std::complex& x) { + return std::cosh(x); + } + static std::complex tanh( + const std::complex& x) { + return std::tanh(x); + } + static 
std::complex asin( + const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::asin(x); +#else + return ::asin(x); +#endif + } + static std::complex acos( + const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::acos(x); +#else + return ::acos(x); +#endif + } + static std::complex atan( + const std::complex& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + using sycl::atan; +#else + using std::atan; +#endif + return atan(x); + } + static std::complex nan() { + const mag_type mag_nan = ArithTraits::nan(); + return std::complex(mag_nan, mag_nan); + } + static mag_type epsilon() { return ArithTraits::epsilon(); } + + // Backwards compatibility with Teuchos::ScalarTraits. + using magnitudeType = mag_type; + using halfPrecision = + std::complex::halfPrecision>; + using doublePrecision = + std::complex::doublePrecision>; + + static constexpr bool isComplex = true; + static constexpr bool isOrdinal = false; + static constexpr bool isComparable = false; + static constexpr bool hasMachineParameters = true; + static bool isnaninf(const std::complex& x) { + return isNan(x) || isInf(x); + } + static mag_type magnitude(const std::complex& x) { + return abs(x); + } + static std::complex conjugate( + const std::complex& x) { + return conj(x); + } + static std::string name() { + return std::string("std::complex<") + ArithTraits::name() + ">"; + } + static std::complex squareroot( + const std::complex& x) { + return sqrt(x); + } + static mag_type eps() { return epsilon(); } + static mag_type sfmin() { return ArithTraits::sfmin(); } + static int base() { return ArithTraits::base(); } + static mag_type prec() { return ArithTraits::prec(); } + static int t() { return ArithTraits::t(); } + static mag_type rnd() { return ArithTraits::one(); } + static int emin() { return ArithTraits::emin(); } + static mag_type rmin() { return ArithTraits::rmin(); } + static int emax() { return ArithTraits::emax(); } + static 
mag_type rmax() { return ArithTraits::rmax(); } +}; + +template <> +class ArithTraits { + public: + using val_type = char; + using mag_type = val_type; + + // The C(++) standard does not require that char be signed. In + // fact, signed char, unsigned char, and char are distinct types. + // We can use std::numeric_limits here because it's a const bool, + // not a class method. + static constexpr bool is_signed = std::numeric_limits::is_signed; + + static std::string name() { return "char"; } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() +}; + +template <> +class ArithTraits { + public: + using val_type = signed char; + using mag_type = val_type; + + static constexpr bool is_signed = true; + + static std::string name() { return "signed char"; } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() +}; + +template <> +class ArithTraits { + public: + using val_type = unsigned char; + using mag_type = val_type; + + static constexpr bool is_signed = false; + + static std::string name() { return "unsigned char"; } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() +}; + +template <> +class ArithTraits { + public: + using val_type = short; + using mag_type = val_type; + + static constexpr bool is_signed = true; + + static std::string name() { return "short"; } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() +}; + +template <> +class ArithTraits { + public: + using val_type = unsigned short; + using mag_type = val_type; + + static constexpr bool is_signed = false; + + static std::string name() { return "unsigned short"; } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() +}; + +template <> +class ArithTraits { + public: + using val_type = int; + using mag_type = val_type; + + static constexpr bool is_signed = true; + + static std::string name() { return "int"; } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() +}; + +template <> +class ArithTraits { + public: + using val_type = unsigned int; + using mag_type = val_type; + + static constexpr bool is_signed = false; + + static std::string name() { return "unsigned int"; } + + 
KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() +}; + +template <> +class ArithTraits { + public: + using val_type = long; + using mag_type = val_type; + + static constexpr bool is_signed = true; + + static std::string name() { return "long"; } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() +}; + +template <> +class ArithTraits { + public: + using val_type = unsigned long; + using mag_type = val_type; + + static constexpr bool is_signed = false; + + static std::string name() { return "unsigned long"; } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() +}; + +template <> +class ArithTraits { + public: + using val_type = long long; + using mag_type = val_type; + + static constexpr bool is_signed = true; + + static std::string name() { return "long long"; } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() +}; + +template <> +class ArithTraits { + public: + using val_type = unsigned long long; + using mag_type = val_type; + + static constexpr bool is_signed = false; + + static std::string name() { return "unsigned long long"; } + + KOKKOSKERNELS_ARITHTRAITS_INTEGRAL() +}; + +// dd_real and qd_real are floating-point types provided by the QD +// library of David Bailey (LBNL): +// +// http://crd-legacy.lbl.gov/~dhbailey/mpdist/ +// +// dd_real uses two doubles (128 bits), and qd_real uses four doubles +// (256 bits). +// +// Kokkos does not currently support these types in device +// functions. It should be possible to use Kokkos' support for +// aggregate types to implement device function support for dd_real +// and qd_real, but we have not done this yet (as of 09 Jan 2015). +// Hence, the class methods of the ArithTraits specializations for +// dd_real and qd_real are not marked as device functions. +#ifdef HAVE_KOKKOS_QD +// LBV: I would like to deprecate this strange optional +// dependency on the lbnl package, is there anyone actully +// using this? It certainly is never tested by CI or nightly +// so probably does not work... 
+template <> +struct [[deprecated]] ArithTraits { + typedef dd_real val_type; + typedef dd_real mag_type; + + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool is_complex = false; + + static inline bool isInf(const val_type& x) { return isinf(x); } + static inline bool isNan(const val_type& x) { return isnan(x); } + static inline mag_type abs(const val_type& x) { return ::abs(x); } + static inline val_type zero() { return val_type(0.0); } + static inline val_type one() { return val_type(1.0); } + static inline val_type min() { return std::numeric_limits::min(); } + static inline val_type max() { return std::numeric_limits::max(); } + static inline mag_type real(const val_type& x) { return x; } + static inline mag_type imag(const val_type&) { return zero(); } + static inline val_type conj(const val_type& x) { return x; } + static inline val_type pow(const val_type& x, const val_type& y) { + return ::pow(x, y); + } + static inline val_type sqrt(const val_type& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::sqrt(x); +#else + return ::sqrt(x); +#endif + } + static inline val_type cbrt(const val_type& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::cbrt(x); +#else + return ::cbrt(x); +#endif + } + static inline val_type exp(const val_type& x) { return ::exp(x); } + static inline val_type log(const val_type& x) { + // dd_real puts its transcendental functions in the global namespace. 
+ return ::log(x); + } + static inline val_type log10(const val_type& x) { return ::log10(x); } + static KOKKOS_FUNCTION val_type sin(const val_type x) { return ::sin(x); } + static KOKKOS_FUNCTION val_type cos(const val_type x) { return ::cos(x); } + static KOKKOS_FUNCTION val_type tan(const val_type x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::tan(x); +#else + return std::tan(x); +#endif + } + static KOKKOS_FUNCTION val_type sinh(const val_type x) { return ::sinh(x); } + static KOKKOS_FUNCTION val_type cosh(const val_type x) { return ::cosh(x); } + static KOKKOS_FUNCTION val_type tanh(const val_type x) { return ::tanh(x); } + static KOKKOS_FUNCTION val_type asin(const val_type x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::asin(x); +#else + return ::asin(x); +#endif + } + static KOKKOS_FUNCTION val_type acos(const val_type x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::acos(x); +#else + return ::acos(x); +#endif + } + static KOKKOS_FUNCTION val_type atan(const val_type x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::atan(x); +#else + return ::atan(x); +#endif + } + static inline val_type nan() { return val_type::_nan; } + static val_type epsilon() { return std::numeric_limits::epsilon(); } + + typedef dd_real magnitudeType; + typedef double halfPrecision; + typedef qd_real doublePrecision; + + static const bool isComplex = false; + static const bool isOrdinal = false; + static const bool isComparable = true; + static const bool hasMachineParameters = true; + + static mag_type eps() { return epsilon(); } + static mag_type sfmin() { return min(); } + static int base() { return std::numeric_limits::radix; } + static mag_type prec() { return eps() * base(); } + static int t() { return std::numeric_limits::digits; } + static mag_type rnd() { + return std::numeric_limits::round_style == std::round_to_nearest + ? 
one() + : zero(); + } + static int emin() { return std::numeric_limits::min_exponent; } + static mag_type rmin() { return std::numeric_limits::min(); } + static int emax() { return std::numeric_limits::max_exponent; } + static mag_type rmax() { return std::numeric_limits::max(); } + static mag_type magnitude(const val_type& x) { return ::abs(x); } + static val_type conjugate(const val_type& x) { return conj(x); } + static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } + static std::string name() { return "dd_real"; } + static val_type squareroot(const val_type& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::sqrt(x); +#else + return ::sqrt(x); +#endif + } +}; + +template <> +struct [[deprecated]] ArithTraits { + typedef qd_real val_type; + typedef qd_real mag_type; + + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool is_complex = false; + + static inline bool isInf(const val_type& x) { return isinf(x); } + static inline bool isNan(const val_type& x) { return isnan(x); } + static inline mag_type abs(const val_type& x) { return ::abs(x); } + static inline val_type zero() { return val_type(0.0); } + static inline val_type one() { return val_type(1.0); } + static inline val_type min() { return std::numeric_limits::min(); } + static inline val_type max() { return std::numeric_limits::max(); } + static inline mag_type real(const val_type& x) { return x; } + static inline mag_type imag(const val_type&) { return zero(); } + static inline val_type conj(const val_type& x) { return x; } + static inline val_type pow(const val_type& x, const val_type& y) { + return ::pow(x, y); + } + static inline val_type sqrt(const val_type& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::sqrt(x); +#else + return ::sqrt(x); +#endif + } + static inline val_type cbrt(const val_type& x) { +#ifdef 
KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::cbrt(x); +#else + return ::cbrt(x); +#endif + } + static inline val_type exp(const val_type& x) { return ::exp(x); } + static inline val_type log(const val_type& x) { + // val_type puts its transcendental functions in the global namespace. + return ::log(x); + } + static inline val_type log10(const val_type& x) { return ::log10(x); } + static KOKKOS_FUNCTION val_type sin(const val_type x) { return ::sin(x); } + static KOKKOS_FUNCTION val_type cos(const val_type x) { return ::cos(x); } + static KOKKOS_FUNCTION val_type tan(const val_type x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::tan(x); +#else + return std::tan(x); +#endif + } + static KOKKOS_FUNCTION val_type sinh(const val_type x) { return ::sinh(x); } + static KOKKOS_FUNCTION val_type cosh(const val_type x) { return ::cosh(x); } + static KOKKOS_FUNCTION val_type tanh(const val_type x) { return ::tanh(x); } + static KOKKOS_FUNCTION val_type asin(const val_type x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::asin(x); +#else + return ::asin(x); +#endif + } + static KOKKOS_FUNCTION val_type acos(const val_type x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::acos(x); +#else + return ::acos(x); +#endif + } + static KOKKOS_FUNCTION val_type atan(const val_type x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::atan(x); +#else + return ::atan(x); +#endif + } + static inline val_type nan() { return val_type::_nan; } + static inline val_type epsilon() { + return std::numeric_limits::epsilon(); + } + + typedef qd_real magnitudeType; + typedef dd_real halfPrecision; + // The QD library does not have an "oct-double real" class. One + // could use an arbitrary-precision library like MPFR or ARPREC, + // with the precision set appropriately, to get an + // extended-precision type for qd_real. 
+ typedef qd_real doublePrecision; + + static const bool isComplex = false; + static const bool isOrdinal = false; + static const bool isComparable = true; + static const bool hasMachineParameters = true; + + static mag_type eps() { return epsilon(); } + static mag_type sfmin() { return min(); } + static int base() { return std::numeric_limits::radix; } + static mag_type prec() { return eps() * base(); } + static int t() { return std::numeric_limits::digits; } + static mag_type rnd() { + return std::numeric_limits::round_style == std::round_to_nearest + ? one() + : zero(); + } + static int emin() { return std::numeric_limits::min_exponent; } + static mag_type rmin() { return std::numeric_limits::min(); } + static int emax() { return std::numeric_limits::max_exponent; } + static mag_type rmax() { return std::numeric_limits::max(); } + static mag_type magnitude(const val_type& x) { return ::abs(x); } + static val_type conjugate(const val_type& x) { return conj(x); } + static bool isnaninf(const val_type& x) { return isNan(x) || isInf(x); } + static std::string name() { return "qd_real"; } + static val_type squareroot(const val_type& x) { +#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL + return sycl::sqrt(x); +#else + return ::sqrt(x); +#endif + } +}; +#endif // HAVE_KOKKOS_QD + +} // namespace Details + +// Promote ArithTraits into Kokkos namespace. At some point, we +// will remove it from the Details namespace completely. We leave +// it there for now, because a lot of code depends on it being +// there. 
+using Details::ArithTraits; +} // namespace Kokkos + +#endif // KOKKOS_ARITHTRAITS_HPP diff --git a/src/Kokkos_InnerProductSpaceTraits.hpp b/src/common/Kokkos_InnerProductSpaceTraits.hpp similarity index 100% rename from src/Kokkos_InnerProductSpaceTraits.hpp rename to src/common/Kokkos_InnerProductSpaceTraits.hpp diff --git a/src/graph/KokkosGraph_Distance1Color.hpp b/src/graph/KokkosGraph_Distance1Color.hpp index 3001ea660c..aca6414c83 100644 --- a/src/graph/KokkosGraph_Distance1Color.hpp +++ b/src/graph/KokkosGraph_Distance1Color.hpp @@ -44,8 +44,8 @@ #ifndef _KOKKOSGRAPH_DISTANCE1_COLOR_HPP #define _KOKKOSGRAPH_DISTANCE1_COLOR_HPP -#include "KokkosGraph_Distance1ColorHandle.hpp" -#include "KokkosGraph_Distance1Color_impl.hpp" +#include "KokkosGraph_color_d1_spec.hpp" +#include "KokkosKernels_helpers.hpp" #include "KokkosKernels_Utils.hpp" namespace KokkosGraph { @@ -59,81 +59,35 @@ void graph_color_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t /* num_cols */, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool /* is_symmetric */ = true) { - Kokkos::Timer timer; - - typename KernelHandle::GraphColoringHandleType *gch = - handle->get_graph_coloring_handle(); - - ColoringAlgorithm algorithm = gch->get_coloring_algo_type(); - - typedef typename KernelHandle::GraphColoringHandleType::color_view_t - color_view_type; - - gch->set_tictoc(handle->get_verbose()); - - color_view_type colors_out; - if (gch->get_vertex_colors().use_count() > 0) { - colors_out = gch->get_vertex_colors(); - } else { - colors_out = color_view_type("Graph Colors", num_rows); - } - - typedef - typename Impl::GraphColor - BaseGraphColoring; - BaseGraphColoring *gc = NULL; - - switch (algorithm) { - case COLORING_SERIAL: - gc = new BaseGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); - break; - - case COLORING_VB: - case COLORING_VBBIT: - case COLORING_VBCS: - typedef typename Impl::GraphColor_VB< - typename KernelHandle::GraphColoringHandleType, 
lno_row_view_t_, - lno_nnz_view_t_> - VBGraphColoring; - gc = new VBGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); - break; - - case COLORING_VBD: - case COLORING_VBDBIT: - typedef typename Impl::GraphColor_VBD< - typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, - lno_nnz_view_t_> - VBDGraphColoring; - gc = new VBDGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); - break; - - case COLORING_EB: - typedef typename Impl::GraphColor_EB< - typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, - lno_nnz_view_t_> - EBGraphColoring; - gc = new EBGraphColoring(num_rows, entries.extent(0), row_map, entries, - gch); - break; - - case COLORING_DEFAULT: break; - - default: break; - } - - int num_phases = 0; - gc->color_graph(colors_out, num_phases); - - delete gc; - double coloring_time = timer.seconds(); - gch->add_to_overall_coloring_time(coloring_time); - gch->set_coloring_time(coloring_time); - gch->set_num_phases(num_phases); - gch->set_vertex_colors(colors_out); + typedef typename KernelHandle::HandleExecSpace ExecSpace; + typedef typename KernelHandle::HandleTempMemorySpace MemSpace; + typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; + typedef typename Kokkos::Device DeviceType; + + typedef typename KernelHandle::const_size_type c_size_t; + typedef typename KernelHandle::const_nnz_lno_t c_lno_t; + typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t; + + typedef typename KokkosKernels::Experimental::KokkosKernelsHandle< + c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace> + ConstKernelHandle; + ConstKernelHandle tmp_handle(*handle); + + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_rowmap; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_entries; + KokkosGraph::Impl:: + COLOR_D1::color_d1( + &tmp_handle, num_rows, + Internal_rowmap(row_map.data(), row_map.extent(0)), + 
Internal_entries(entries.data(), entries.extent(0))); } template ( + KokkosSparse::Impl::transpose_graph( num_rows, num_columns, row_map, row_entries, col_map, col_entries); } InternalRowmap rowmap_internal(row_map.data(), row_map.extent(0)); @@ -235,8 +235,8 @@ void bipartite_color_columns(KernelHandle *handle, TRowmap col_map("Col map", num_columns + 1); TEntries col_entries( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Col entries"), nnz); - KokkosKernels::Impl::transpose_graph( + KokkosSparse::Impl::transpose_graph( num_rows, num_columns, row_map, row_entries, col_map, col_entries); // Get unmanaged views for both graph and its transpose InternalRowmap colmap_internal(col_map.data(), col_map.extent(0)); diff --git a/src/graph/KokkosGraph_ExplicitCoarsening.hpp b/src/graph/KokkosGraph_ExplicitCoarsening.hpp index 8992aa4bb8..322004c0b6 100644 --- a/src/graph/KokkosGraph_ExplicitCoarsening.hpp +++ b/src/graph/KokkosGraph_ExplicitCoarsening.hpp @@ -46,7 +46,7 @@ #define KOKKOSGRAPH_EXPLICIT_COARSEN_HPP #include "KokkosGraph_ExplicitCoarsening_impl.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_SortCrs.hpp" namespace KokkosGraph { namespace Experimental { @@ -86,8 +86,8 @@ void graph_explicit_coarsen( if (compress) { coarse_rowmap_t mergedRowmap; coarse_entries_t mergedEntries; - KokkosKernels::sort_and_merge_graph( + KokkosSparse::sort_and_merge_graph( coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); coarseRowmap = mergedRowmap; coarseEntries = mergedEntries; @@ -125,8 +125,8 @@ void graph_explicit_coarsen_with_inverse_map( if (compress) { coarse_rowmap_t mergedRowmap; coarse_entries_t mergedEntries; - KokkosKernels::sort_and_merge_graph( + KokkosSparse::sort_and_merge_graph( coarseRowmap, coarseEntries, mergedRowmap, mergedEntries); coarseRowmap = mergedRowmap; coarseEntries = mergedEntries; diff --git a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp index 
39e27795cc..64873708b5 100644 --- a/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance1Color_impl.hpp @@ -411,7 +411,6 @@ class GraphColor_VB nnz_lno_t numUncolored = this->nv; - double t, total = 0.0; double total_time_greedy_phase = 0.0; double total_time_find_conflicts = 0.0; double total_time_serial_conflict_resolution = 0.0; @@ -435,8 +434,7 @@ class GraphColor_VB MyExecSpace().fence(); if (this->_ticToc) { - t = timer.seconds(); - total += t; + double t = timer.seconds(); total_time_greedy_phase += t; std::cout << "\tTime speculative greedy phase " << iter << " : " << t << std::endl; @@ -459,8 +457,7 @@ class GraphColor_VB MyExecSpace().fence(); if (_ticToc) { - t = timer.seconds(); - total += t; + double t = timer.seconds(); total_time_find_conflicts += t; std::cout << "\tTime conflict detection " << iter << " : " << t << std::endl; @@ -500,8 +497,7 @@ class GraphColor_VB } MyExecSpace().fence(); if (_ticToc) { - t = timer.seconds(); - total += t; + double t = timer.seconds(); total_time_serial_conflict_resolution += t; std::cout << "\tTime serial conflict resolution: " << t << std::endl; } @@ -3118,6 +3114,88 @@ class GraphColor_EB : public GraphColor +void graph_color_impl(KernelHandle *handle, + typename KernelHandle::nnz_lno_t num_rows, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries) { + Kokkos::Timer timer; + + typename KernelHandle::GraphColoringHandleType *gch = + handle->get_graph_coloring_handle(); + + ColoringAlgorithm algorithm = gch->get_coloring_algo_type(); + + typedef typename KernelHandle::GraphColoringHandleType::color_view_t + color_view_type; + + gch->set_tictoc(handle->get_verbose()); + + color_view_type colors_out; + if (gch->get_vertex_colors().use_count() > 0) { + colors_out = gch->get_vertex_colors(); + } else { + colors_out = color_view_type("Graph Colors", num_rows); + } + + typedef + typename Impl::GraphColor + BaseGraphColoring; + BaseGraphColoring *gc = NULL; + + switch (algorithm) { 
+ case COLORING_SERIAL: + gc = new BaseGraphColoring(num_rows, entries.extent(0), row_map, entries, + gch); + break; + + case COLORING_VB: + case COLORING_VBBIT: + case COLORING_VBCS: + typedef typename Impl::GraphColor_VB< + typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, + lno_nnz_view_t_> + VBGraphColoring; + gc = new VBGraphColoring(num_rows, entries.extent(0), row_map, entries, + gch); + break; + + case COLORING_VBD: + case COLORING_VBDBIT: + typedef typename Impl::GraphColor_VBD< + typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, + lno_nnz_view_t_> + VBDGraphColoring; + gc = new VBDGraphColoring(num_rows, entries.extent(0), row_map, entries, + gch); + break; + + case COLORING_EB: + typedef typename Impl::GraphColor_EB< + typename KernelHandle::GraphColoringHandleType, lno_row_view_t_, + lno_nnz_view_t_> + EBGraphColoring; + gc = new EBGraphColoring(num_rows, entries.extent(0), row_map, entries, + gch); + break; + + case COLORING_DEFAULT: break; + + default: break; + } + + int num_phases = 0; + gc->color_graph(colors_out, num_phases); + + delete gc; + double coloring_time = timer.seconds(); + gch->add_to_overall_coloring_time(coloring_time); + gch->set_coloring_time(coloring_time); + gch->set_num_phases(num_phases); + gch->set_vertex_colors(colors_out); +} + } // namespace Impl } // namespace KokkosGraph diff --git a/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp b/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp index ed40646711..c8dddcefb8 100644 --- a/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2Color_impl.hpp @@ -51,7 +51,6 @@ #include #include -#include #include #include diff --git a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp index 1628b715a8..195d08dc0a 100644 --- a/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp +++ b/src/graph/impl/KokkosGraph_Distance2MIS_impl.hpp @@ -48,6 +48,7 @@ #include "Kokkos_Core.hpp" 
#include "Kokkos_Bitset.hpp" #include "KokkosKernels_Utils.hpp" +#include "KokkosSparse_Utils.hpp" #include namespace KokkosGraph { @@ -597,7 +598,7 @@ struct D2_MIS_FixedPriority { Kokkos::view_alloc(Kokkos::WithoutInitializing, "RowStatus"), numVerts); colStatus = status_view_t( Kokkos::view_alloc(Kokkos::WithoutInitializing, "ColStatus"), numVerts); - KokkosKernels::Impl::graph_min_max_degree( + KokkosSparse::Impl::graph_min_max_degree( rowmap, minDegree, maxDegree); // Compute row statuses Kokkos::parallel_for(range_pol(0, numVerts), diff --git a/src/graph/impl/KokkosGraph_color_d1_spec.hpp b/src/graph/impl/KokkosGraph_color_d1_spec.hpp new file mode 100644 index 0000000000..09366f2c4e --- /dev/null +++ b/src/graph/impl/KokkosGraph_color_d1_spec.hpp @@ -0,0 +1,153 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSSPARSE_IMPL_COLOR_D1_SPEC_HPP_ +#define KOKKOSSPARSE_IMPL_COLOR_D1_SPEC_HPP_ + +#include + +#include +#include "KokkosKernels_Handle.hpp" +// Include the actual functors +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include "KokkosGraph_Distance1Color_impl.hpp" +#endif + +namespace KokkosGraph { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct color_d1_eti_spec_avail { + enum : bool { value = false }; +}; + +} // namespace Impl +} // namespace KokkosGraph + +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ + template <> \ + struct color_d1_eti_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>> { \ + enum : bool { value = 
true }; \ + }; + +// Include the actual specialization declarations +#include + +namespace KokkosGraph { +namespace Impl { + +// Unification layer +/// \brief Implementation of KokkosGraph::graph_color (distance-1 greedy +/// coloring) + +template ::value> +struct COLOR_D1 { + static void color_d1(KernelHandle *handle, + typename lno_view_t::non_const_value_type num_rows, + size_view_t rowmap, lno_view_t entries); +}; + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + +template +struct COLOR_D1 { + static void color_d1(KernelHandle *handle, + typename lno_view_t::non_const_value_type num_rows, + size_view_t rowmap, lno_view_t entries) { + KokkosGraph::Impl::graph_color_impl(handle, num_rows, rowmap, entries); + } +}; + +#endif + +} // namespace Impl +} // namespace KokkosGraph + +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ + extern template struct COLOR_D1< \ + typename KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; + +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ + template struct COLOR_D1< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + false, true>; + +#include + +#endif diff --git a/src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in 
b/src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..eb5d74232e --- /dev/null +++ b/src/impl/generated_specializations_cpp/bspgemm_numeric/KokkosSparse_bspgemm_numeric_eti_spec_inst.cpp.in @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" + +#include "KokkosSparse_bspgemm_numeric_spec.hpp" +namespace KokkosSparse { +namespace Impl { +@SPARSE_BSPGEMM_NUMERIC_ETI_INST_BLOCK@ + } //IMPL +} //Kokkos \ No newline at end of file diff --git a/src/impl/generated_specializations_cpp/color_d1/KokkosGraph_color_d1_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/color_d1/KokkosGraph_color_d1_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..c4e4c8efe6 --- /dev/null +++ b/src/impl/generated_specializations_cpp/color_d1/KokkosGraph_color_d1_eti_spec_inst.cpp.in @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" + +#include "KokkosGraph_color_d1_spec.hpp" +namespace KokkosGraph { +namespace Impl { +@GRAPH_COLOR_D1_ETI_INST_BLOCK@ + } //IMPL +} //Kokkos diff --git a/src/impl/generated_specializations_cpp/spadd_numeric/KokkosSparse_spadd_numeric_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/spadd_numeric/KokkosSparse_spadd_numeric_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..1ffa61b1d5 --- /dev/null +++ b/src/impl/generated_specializations_cpp/spadd_numeric/KokkosSparse_spadd_numeric_eti_spec_inst.cpp.in @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" + +#include "KokkosSparse_spadd_numeric_spec.hpp" +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_NUMERIC_ETI_INST_BLOCK@ + } //IMPL +} //Kokkos diff --git a/src/impl/generated_specializations_cpp/spadd_symbolic/KokkosSparse_spadd_symbolic_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/spadd_symbolic/KokkosSparse_spadd_symbolic_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..aa08a1c6c7 --- /dev/null +++ b/src/impl/generated_specializations_cpp/spadd_symbolic/KokkosSparse_spadd_symbolic_eti_spec_inst.cpp.in @@ -0,0 +1,53 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" + +#include "KokkosSparse_spadd_symbolic_spec.hpp" +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_SYMBOLIC_ETI_INST_BLOCK@ + } //IMPL +} //Kokkos diff --git a/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..daff73b371 --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_avail.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_AVAIL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosGraph { +namespace Impl { +@GRAPH_COLOR_D1_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..8e8ca17113 --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosGraph_color_d1_eti_spec_decl.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_ +#define KOKKOSGRAPH_COLOR_D1_ETI_SPEC_DECL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosGraph { +namespace Impl { +@GRAPH_COLOR_D1_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..7159192433 --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_avail.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_AVAIL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_BSPGEMM_NUMERIC_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..5d63c640d6 --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_bspgemm_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_BSPGEMM_NUMERIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif \ No newline at end of file diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..b47c423974 --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_avail.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_AVAIL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_NUMERIC_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..fd971bc314 --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_numeric_eti_spec_decl.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_NUMERIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..b38552c34a --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_avail.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_AVAIL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_SYMBOLIC_ETI_AVAIL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..ea001cb72b --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spadd_symbolic_eti_spec_decl.hpp.in @@ -0,0 +1,51 @@ +#ifndef KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL_HPP_ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { +namespace Impl { +@SPARSE_SPADD_SYMBOLIC_ETI_DECL_BLOCK@ + } //IMPL +} //Kokkos +#endif diff --git a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 33ee439316..2d67c95c3e 100644 --- a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -613,9 +613,12 @@ namespace Impl { KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_dgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -657,9 +660,12 @@ namespace Impl { KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ rocblas_sgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, \ X.data(), one, &beta, Y.data(), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -702,6 +708,8 @@ namespace Impl { KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_zgemv( \ s.handle, transa, M, N, \ reinterpret_cast(&alpha), \ @@ -709,6 +717,7 @@ namespace Impl { reinterpret_cast(X.data()), 
one, \ reinterpret_cast(&beta), \ reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -751,6 +760,8 @@ namespace Impl { KOKKOSBLAS2_GEMV_ROCBLAS_DETERMINE_ARGS(LAYOUT); \ KokkosBlas::Impl::RocBlasSingleton& s = \ KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_cgemv( \ s.handle, transa, M, N, \ reinterpret_cast(&alpha), \ @@ -758,6 +769,7 @@ namespace Impl { reinterpret_cast(X.data()), one, \ reinterpret_cast(&beta), \ reinterpret_cast(Y.data()), one)); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ Kokkos::Profiling::popRegion(); \ } \ }; diff --git a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp index 974fe76eb0..c025a1a11e 100644 --- a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_avail.hpp @@ -58,7 +58,7 @@ struct trtri_tpl_spec_avail { #define KOKKOSBLAS_TRTRI_TPL_SPEC_AVAIL(SCALAR, LAYOUTA, MEMSPACE) \ template \ struct trtri_tpl_spec_avail< \ - Kokkos::View >, \ Kokkos::View, \ Kokkos::MemoryTraits > > { \ diff --git a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp index af9f843938..af6c186039 100644 --- a/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas_trtri_tpl_spec_decl.hpp @@ -55,14 +55,14 @@ namespace Impl { #define KOKKOSBLAS_TRTRI_BLAS_HOST(SCALAR_TYPE, BASE_SCALAR_TYPE, LAYOUTA, \ MEM_SPACE, ETI_SPEC_AVAIL) \ template \ - struct TRTRI >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef SCALAR_TYPE SCALAR; \ - typedef Kokkos::View > \ RViewType; \ typedef Kokkos::View \ - struct TRTRI >, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ typedef SCALAR_TYPE SCALAR; 
\ - typedef Kokkos::View > \ RViewType; \ typedef Kokkos::View +struct spadd_symbolic_tpl_spec_avail { + enum : bool { value = false }; +}; + +template +struct spadd_numeric_tpl_spec_avail { + enum : bool { value = false }; +}; + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/src/impl/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp new file mode 100644 index 0000000000..d9f6a19911 --- /dev/null +++ b/src/impl/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp @@ -0,0 +1,52 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSPARSE_SPADD_TPL_SPEC_DECL_HPP_ +#define KOKKOSPARSE_SPADD_TPL_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Impl {} +} // namespace KokkosSparse + +#endif diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index cd8287b38e..57170d6eb6 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -195,93 +195,49 @@ struct spmv_mv_bsrmatrix_tpl_spec_avail { // These versions of cuSPARSE require the ordinal and offset types to be the // same. For KokkosKernels, this means int/int only. 
- -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \ - SCALAR, ORDINAL, OFFSET, XL, YL, MEMSPACE) \ - template <> \ - struct spmv_mv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const ORDINAL, Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET, const SCALAR*, \ - XL, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - YL, Kokkos::Device, \ - Kokkos::MemoryTraits, true> { \ - enum : bool { value = true }; \ +// cuSparse level 3 does not currently support LayoutRight +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \ + SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ + template <> \ + struct spmv_mv_bsrmatrix_tpl_spec_avail< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET, const SCALAR**, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, false> { \ + enum : bool { value = true }; \ }; #if (9000 <= CUDA_VERSION) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, 
int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, - int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, - int, int, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, - int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, - int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif // CUDA/CUSPARSE >= 9.0? 
diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index a1ae213ea9..93457f9837 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -42,10 +42,11 @@ //@HEADER */ -#ifndef KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP -#define KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP +#ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP +#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP #include "KokkosKernels_Controls.hpp" +#include "KokkosSparse_Utils_mkl.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include @@ -57,26 +58,7 @@ namespace Impl { #if (__INTEL_MKL__ > 2017) // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() -namespace BSR { -inline void mkl_safe_call(int errcode) { - if (errcode != SPARSE_STATUS_SUCCESS) - throw std::runtime_error("MKL returned non-success error code"); -} - -inline sparse_operation_t mode_kk_to_mkl(char mode_kk) { - switch (toupper(mode_kk)) { - case 'N': return SPARSE_OPERATION_NON_TRANSPOSE; - case 'T': return SPARSE_OPERATION_TRANSPOSE; - case 'H': return SPARSE_OPERATION_CONJUGATE_TRANSPOSE; - default:; - } - throw std::invalid_argument( - "Invalid mode for MKL (should be one of N, T, H)"); -} -} // namespace BSR - -using BSR::mkl_safe_call; -using BSR::mode_kk_to_mkl; +using KokkosSparse::Impl::mode_kk_to_mkl; inline matrix_descr getDescription() { matrix_descr A_descr; @@ -91,13 +73,14 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, const int* Aentries, const float* Avalues, const float* x, float* y) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_s_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); 
matrix_descr A_descr = getDescription(); - mkl_safe_call(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + KOKKOSKERNELS_MKL_SAFE_CALL( + mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, @@ -106,13 +89,14 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, const double* Avalues, const double* x, double* y) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_d_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); - mkl_safe_call(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + KOKKOSKERNELS_MKL_SAFE_CALL( + mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_block_impl_mkl(sparse_operation_t op, @@ -123,17 +107,17 @@ inline void spmv_block_impl_mkl(sparse_operation_t op, const Kokkos::complex* x, Kokkos::complex* y) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_c_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); - MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); - MKL_Complex8& beta_mkl = reinterpret_cast(beta); - matrix_descr A_descr = getDescription(); - mkl_safe_call(mkl_sparse_c_mv(op, alpha_mkl, A_mkl, A_descr, - reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); + MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; + MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; + matrix_descr A_descr = getDescription(); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv( + op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), + beta_mkl, reinterpret_cast(y))); } inline void spmv_block_impl_mkl(sparse_operation_t op, @@ -144,17 +128,17 @@ 
inline void spmv_block_impl_mkl(sparse_operation_t op, const Kokkos::complex* x, Kokkos::complex* y) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_z_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); - matrix_descr A_descr = getDescription(); - MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); - MKL_Complex16& beta_mkl = reinterpret_cast(beta); - mkl_safe_call(mkl_sparse_z_mv(op, alpha_mkl, A_mkl, A_descr, - reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); + matrix_descr A_descr = getDescription(); + MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; + MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv( + op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), + beta_mkl, reinterpret_cast(y))); } inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, @@ -163,15 +147,15 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, const float* Avalues, const float* x, int colx, int ldx, float* y, int ldy) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_s_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); - mkl_safe_call(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr, - SPARSE_LAYOUT_ROW_MAJOR, x, colx, ldx, beta, y, - ldy)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr, + SPARSE_LAYOUT_ROW_MAJOR, x, colx, + ldx, beta, y, ldy)); } inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, @@ -180,15 +164,15 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, const double* Avalues, const double* x, int colx, int ldx, double* y, int ldy) { 
sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_d_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); matrix_descr A_descr = getDescription(); - mkl_safe_call(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr, - SPARSE_LAYOUT_ROW_MAJOR, x, colx, ldx, beta, y, - ldy)); + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr, + SPARSE_LAYOUT_ROW_MAJOR, x, colx, + ldx, beta, y, ldy)); } inline void spm_mv_block_impl_mkl(sparse_operation_t op, @@ -200,15 +184,15 @@ inline void spm_mv_block_impl_mkl(sparse_operation_t op, const Kokkos::complex* x, int colx, int ldx, Kokkos::complex* y, int ldy) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_c_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); - MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); - MKL_Complex8& beta_mkl = reinterpret_cast(beta); - matrix_descr A_descr = getDescription(); - mkl_safe_call( + MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; + MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; + matrix_descr A_descr = getDescription(); + KOKKOSKERNELS_MKL_SAFE_CALL( mkl_sparse_c_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, reinterpret_cast(x), colx, ldx, beta_mkl, reinterpret_cast(y), ldy)); @@ -221,15 +205,15 @@ inline void spm_mv_block_impl_mkl( const Kokkos::complex* x, int colx, int ldx, Kokkos::complex* y, int ldy) { sparse_matrix_t A_mkl; - mkl_safe_call(mkl_sparse_z_create_bsr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_bsr( &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); - matrix_descr A_descr = 
getDescription(); - MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); - MKL_Complex16& beta_mkl = reinterpret_cast(beta); - mkl_safe_call( + matrix_descr A_descr = getDescription(); + MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; + MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; + KOKKOSKERNELS_MKL_SAFE_CALL( mkl_sparse_z_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, reinterpret_cast(x), colx, ldx, beta_mkl, reinterpret_cast(y), ldy)); @@ -470,7 +454,7 @@ KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, // cuSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #include "cusparse.h" -#include "KokkosKernels_SparseUtils_cusparse.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" // // From https://docs.nvidia.com/cuda/cusparse/index.html#bsrmv @@ -503,7 +487,7 @@ void spmv_block_impl_cusparse( default: { std::cerr << "Mode " << mode << " invalid for cusparse[*]bsrmv.\n"; throw std::invalid_argument("Invalid mode"); - } break; + } } #if (9000 <= CUDA_VERSION) @@ -578,8 +562,24 @@ void spmv_block_impl_cusparse( // - Only blockDim > 1 is supported // - Only CUSPARSE_OPERATION_NON_TRANSPOSE is supported // - Only CUSPARSE_MATRIX_TYPE_GENERAL is supported. 
+// - Only LayoutLeft for X and Y: +// for X,Y LayoutLeft we want cuSparse to do +// C = A * B + C +// and for X,Y LayoutRight we want cuSparse to do +// trans(C) = A * trans(B) + trans(C) +// -> t(t(C)) = t(A * t(B)) + t(t(C)) +// -> C = t(t(B)) * t(A) + C +// -> C = B * t(A) + C +// This is impossible in cuSparse without explicitly transposing C, +// so we just do not support LayoutRight in cuSparse TPL now // -template +template < + class AMatrix, class XVector, class YVector, + std::enable_if_t::value && + std::is_same::value, + bool> = true> void spm_mv_block_impl_cusparse( const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, @@ -599,12 +599,14 @@ void spm_mv_block_impl_cusparse( default: { std::cerr << "Mode " << mode << " invalid for cusparse[*]bsrmv.\n"; throw std::invalid_argument("Invalid mode"); - } break; + } } int colx = static_cast(x.extent(1)); - int ldx = static_cast(x.stride_1()); - int ldy = static_cast(y.stride_1()); + + // ldx and ldy should be the leading dimension of X,Y respectively + const int ldx = static_cast(x.extent(0)); + const int ldy = static_cast(y.extent(0)); #if (9000 <= CUDA_VERSION) @@ -761,29 +763,31 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -#endif +#endif // 9000 <= CUDA_VERSION #undef KOKKOSSPARSE_SPMV_CUSPARSE -#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ - COMPILE_LIBRARY) \ +// cuSparse TPL does not support LayoutRight for this operation +// only specialize for LayoutLeft +#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, SPACE, \ + ETI_AVAIL) \ template <> \ struct SPMV_MV_BSRMATRIX< \ SCALAR const, ORDINAL const, Kokkos::Device, \ Kokkos::MemoryTraits, OFFSET const, SCALAR const**, \ - LAYOUT, Kokkos::Device, \ + 
Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, \ - SCALAR**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, true, true, COMPILE_LIBRARY> { \ + SCALAR**, Kokkos::LayoutLeft, Kokkos::Device, \ + Kokkos::MemoryTraits, false, true, ETI_AVAIL> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ using AMatrix = BsrMatrix; \ using XVector = Kokkos::View< \ - SCALAR const**, LAYOUT, device_type, \ + SCALAR const**, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View; \ + using YVector = Kokkos::View; \ using Controls = KokkosKernels::Experimental::Controls; \ \ using coefficient_type = typename YVector::non_const_value_type; \ @@ -802,55 +806,32 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, }; #if (9000 <= CUDA_VERSION) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, true) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, false) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaSpace, true) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaSpace, false) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace, false) 
KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace, false) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaUVMSpace, true) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaUVMSpace, false) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaUVMSpace, true) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaUVMSpace, false) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace, false) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -#endif + Kokkos::CudaUVMSpace, false) + +#endif // 9000 <= CUDA_VERSION #undef KOKKOSSPARSE_SPMV_MV_CUSPARSE @@ -858,6 +839,6 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, } 
// namespace Experimental } // namespace KokkosSparse -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE -#endif // KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP +#endif // KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP diff --git a/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp new file mode 100644 index 0000000000..ef23f6ec9a --- /dev/null +++ b/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_avail.hpp @@ -0,0 +1,175 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSPARSE_SPMV_MV_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSPARSE_SPMV_MV_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosSparse { +namespace Impl { + +// Specialization struct which defines whether a specialization exists +template ::type>::value> +struct spmv_mv_tpl_spec_avail { + enum : bool { value = false }; +}; + +#define KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(SCALAR, ORDINAL, OFFSET, \ + XL, YL, MEMSPACE) \ + template <> \ + struct spmv_mv_tpl_spec_avail< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET, const SCALAR**, \ + XL, Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR**, YL, Kokkos::Device, \ + Kokkos::MemoryTraits > { \ + enum : bool { value = true }; \ + }; + +/* CUSPARSE_VERSION 10300 and lower seem to have a bug in cusparseSpMM +non-transpose that produces incorrect result. This is cusparse distributed with +CUDA 10.1.243. 
The bug seems to be resolved by CUSPARSE 10301 (present by +CUDA 10.2.89) */ +#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, + 
Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int, + int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int, + int, Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) + +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int, + int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::Experimental::half_t, int, + int, Kokkos::LayoutRight, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) + +#endif +#endif // defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) + +} // namespace Impl +} // namespace KokkosSparse + +#endif // KOKKOSPARSE_SPMV_MV_TPL_SPEC_AVAIL_HPP_ diff --git a/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp new file mode 100644 index 0000000000..0bfeec3288 --- /dev/null +++ b/src/impl/tpls/KokkosSparse_spmv_mv_tpl_spec_decl.hpp @@ -0,0 +1,336 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_ +#define KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_ + +#include "KokkosKernels_Controls.hpp" + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +/* CUSPARSE_VERSION < 10301 either doesn't have cusparseSpMM + or the non-transpose version produces incorrect results. 
+*/ +#if defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) +#include "cusparse.h" +#include "KokkosSparse_Utils_cusparse.hpp" + +namespace KokkosSparse { +namespace Impl { + +/* Derive a compute type for various operand types. + cusparseSpMM does not always allow the same compute type as operand types + This should be consistent with the allowed operand types for cusparseSpMM, + as needed for TPL availability. Current definition does not comprehensively + cover all cusparseSpMM options. + + cuSparse 11.5.1+ does not support uniform precision for FP16 + Otherwise, uniform precision is supported +*/ +template +cudaDataType compute_type() { + return cuda_data_type_from(); +} +#if CUSPARSE_VERSION >= 11501 +template <> +inline cudaDataType compute_type() { + return CUDA_R_32F; +} +#else +template <> +inline cudaDataType compute_type() { + return cuda_data_type_from(); +} +#endif + +/*! \brief convert a 2D view to a cusparseDnMatDescr_t + +*/ +template = true> +cusparseDnMatDescr_t make_cusparse_dn_mat_descr_t(ViewType &view) { + const int64_t rows = view.extent(0); + const int64_t cols = view.extent(1); + const int64_t ld = view.extent(0); + + // cusparseCreateCsr notes it is safe to const_cast this away for input + // pointers to a descriptor as long as that descriptor is not an output + // parameter + void *values = + const_cast(view.data()); + + cudaDataType valueType = + cuda_data_type_from(); + + // col-major is the only supported order in 10301 + // ignore the layout of the provided view, and expect the caller to + // fix with a transpose operation, if possible. 
+ // This should be revisited once cusparse supports row-major dense matrices + const cusparseOrder_t order = CUSPARSE_ORDER_COL; + + cusparseDnMatDescr_t descr; + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseCreateDnMat(&descr, rows, cols, ld, values, valueType, order)); + + return descr; +} + +template +void spmv_mv_cusparse(const KokkosKernels::Experimental::Controls &controls, + const char mode[], + typename YVector::non_const_value_type const &alpha, + const AMatrix &A, const XVector &x, + typename YVector::non_const_value_type const &beta, + const YVector &y) { + static_assert(XVector::rank == 2, + "should only be instantiated for multivector"); + static_assert(YVector::rank == 2, + "should only be instantiated for multivector"); + + using offset_type = typename AMatrix::non_const_size_type; + using entry_type = typename AMatrix::non_const_ordinal_type; + using value_type = typename AMatrix::non_const_value_type; + using x_value_type = typename XVector::non_const_value_type; + using y_value_type = typename YVector::non_const_value_type; + + /* initialize cusparse library */ + cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + + /* Set the operation mode */ + cusparseOperation_t opA; + switch (toupper(mode[0])) { + case 'N': opA = CUSPARSE_OPERATION_NON_TRANSPOSE; break; + case 'T': opA = CUSPARSE_OPERATION_TRANSPOSE; break; + case 'H': opA = CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE; break; + default: { + std::cerr << "Mode " << mode << " invalid for cuSPARSE SpMV MV.\n"; + throw std::invalid_argument("Invalid mode"); + } + } + + /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ + const cusparseIndexType_t myCusparseOffsetType = + cusparse_index_type_t_from(); + const cusparseIndexType_t myCusparseEntryType = + cusparse_index_type_t_from(); + const cudaDataType aCusparseType = cuda_data_type_from(); + + /* create matrix */ + cusparseSpMatDescr_t A_cusparse; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateCsr( + &A_cusparse, 
A.numRows(), A.numCols(), A.nnz(), + (void *)A.graph.row_map.data(), (void *)A.graph.entries.data(), + (void *)A.values.data(), myCusparseOffsetType, myCusparseEntryType, + CUSPARSE_INDEX_BASE_ZERO, aCusparseType)); + + /* create lhs and rhs + NOTE: The descriptions always say vecX and vecY are column-major cusparse + order. For CUSPARSE_VERSION 10301 this is the only supported ordering. if X + is not LayoutLeft, we can fix with a transpose. If cusparseSpMM ever + supports row-major dense matrices, this logic will have to be reworked */ + constexpr bool xIsLL = + std::is_same::value; + constexpr bool xIsLR = + std::is_same::value; + static_assert(xIsLL || xIsLR, "X multivector was not LL or LR (TPL error)"); + cusparseDnMatDescr_t vecX = make_cusparse_dn_mat_descr_t(x); + cusparseDnMatDescr_t vecY = make_cusparse_dn_mat_descr_t(y); + cusparseOperation_t opB = + xIsLL ? CUSPARSE_OPERATION_NON_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; + + const cusparseSpMMAlg_t alg = CUSPARSE_MM_ALG_DEFAULT; + + // the precision of the SpMV + const cudaDataType computeType = + compute_type(); + + size_t bufferSize = 0; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM_bufferSize( + cusparseHandle, opA, opB, &alpha, A_cusparse, vecX, &beta, vecY, + computeType, alg, &bufferSize)); + + void *dBuffer = nullptr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&dBuffer, bufferSize)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMM(cusparseHandle, opA, opB, &alpha, + A_cusparse, vecX, &beta, vecY, + computeType, alg, dBuffer)); + + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(dBuffer)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnMat(vecX)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyDnMat(vecY)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroySpMat(A_cusparse)); +} + +#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, XL, YL, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV_MV< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const, SCALAR const **, \ + XL, 
Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR **, YL, Kokkos::Device, \ + Kokkos::MemoryTraits, false, true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using AMatrix = CrsMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const **, XL, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + using Controls = KokkosKernels::Experimental::Controls; \ + static void spmv_mv(const Controls &controls, const char mode[], \ + const coefficient_type &alpha, const AMatrix &A, \ + const XVector &x, const coefficient_type &beta, \ + const YVector &y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_mv_cusparse(controls, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +/* cusparseSpMM with following restrictions + column-major ordering for Y + col-major or row-major for X (see note below) + 32-bit indices for matrix A */ +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, + Kokkos::LayoutLeft, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, + Kokkos::LayoutLeft, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::LayoutLeft, + 
Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int, + Kokkos::LayoutRight, Kokkos::LayoutLeft, + 
Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int, + Kokkos::LayoutLeft, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::Experimental::half_t, int, int, + Kokkos::LayoutRight, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + +#endif + +#undef KOKKOSSPARSE_SPMV_MV_CUSPARSE + +} // namespace Impl +} // namespace KokkosSparse +#endif // defined(CUSPARSE_VERSION) && (10301 <= CUSPARSE_VERSION) +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +#endif // KOKKOSPARSE_SPMV_MV_TPL_SPEC_DECL_HPP_ \ No newline at end of file diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp index fd42797d71..a91996361b 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_avail.hpp @@ -201,6 +201,8 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int64_t, #endif // CUDA/CUSPARSE >= 9.0? 
#endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#undef KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_CUSPARSE + #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) #define KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_ROCSPARSE(SCALAR, LAYOUT) \ @@ -265,15 +267,6 @@ KOKKOSSPARSE_SPMV_TPL_SPEC_AVAIL_MKL(Kokkos::complex, Kokkos::OpenMP) #endif // KOKKOSKERNELS_ENABLE_TPL_MKL -// Specialization struct which defines whether a specialization exists -template ::type>::value> -struct spmv_mv_tpl_spec_avail { - enum : bool { value = false }; -}; - } // namespace Impl } // namespace KokkosSparse diff --git a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp index 17a72b2ad3..b4c73a12ff 100644 --- a/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_tpl_spec_decl.hpp @@ -50,7 +50,7 @@ // cuSPARSE #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE #include "cusparse.h" -#include "KokkosKernels_SparseUtils_cusparse.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" namespace KokkosSparse { namespace Impl { @@ -86,25 +86,11 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, #if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) /* Check that cusparse can handle the types of the input Kokkos::CrsMatrix */ - cusparseIndexType_t myCusparseOffsetType; - if (std::is_same::value) - myCusparseOffsetType = CUSPARSE_INDEX_32I; - else if (std::is_same::value || - std::is_same::value) - myCusparseOffsetType = CUSPARSE_INDEX_64I; - else - throw std::logic_error( - "Offset type of CrsMatrix isn't supported by cuSPARSE, yet TPL layer " - "says it is"); - cusparseIndexType_t myCusparseEntryType; - if (std::is_same::value) - myCusparseEntryType = CUSPARSE_INDEX_32I; - else if (std::is_same::value) - myCusparseEntryType = CUSPARSE_INDEX_64I; - else - throw std::logic_error( - "Ordinal (entry) type of CrsMatrix isn't supported by cuSPARSE, yet " - "TPL layer says it is"); + const cusparseIndexType_t myCusparseOffsetType 
= + cusparse_index_type_t_from(); + const cusparseIndexType_t myCusparseEntryType = + cusparse_index_type_t_from(); + cudaDataType myCudaDataType; if (std::is_same::value) myCudaDataType = CUDA_R_32F; @@ -134,15 +120,27 @@ void spmv_cusparse(const KokkosKernels::Experimental::Controls& controls, KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateDnVec( &vecY, y.extent_int(0), (void*)y.data(), myCudaDataType)); - size_t bufferSize = 0; - void* dBuffer = NULL; + size_t bufferSize = 0; + void* dBuffer = NULL; +#if CUSPARSE_VERSION >= 11201 + cusparseSpMVAlg_t alg = CUSPARSE_SPMV_ALG_DEFAULT; +#else cusparseSpMVAlg_t alg = CUSPARSE_MV_ALG_DEFAULT; +#endif if (controls.isParameter("algorithm")) { const std::string algName = controls.getParameter("algorithm"); if (algName == "default") +#if CUSPARSE_VERSION >= 11201 + alg = CUSPARSE_SPMV_ALG_DEFAULT; +#else alg = CUSPARSE_MV_ALG_DEFAULT; +#endif else if (algName == "merge") +#if CUSPARSE_VERSION >= 11201 + alg = CUSPARSE_SPMV_CSR_ALG2; +#else alg = CUSPARSE_CSRMV_ALG2; +#endif } KOKKOS_CUSPARSE_SAFE_CALL(cusparseSpMV_bufferSize( cusparseHandle, myCusparseOperation, &alpha, A_cusparse, vecX, &beta, @@ -361,8 +359,8 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -#endif -#endif +#endif // defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) +#endif // 9000 <= CUDA_VERSION #undef KOKKOSSPARSE_SPMV_CUSPARSE @@ -373,7 +371,7 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int64_t, size_t, // rocSPARSE #if defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) #include -#include "KokkosKernels_SparseUtils_rocsparse.hpp" +#include "KokkosSparse_Utils_rocsparse.hpp" namespace KokkosSparse { namespace Impl { @@ -530,6 +528,7 @@ KOKKOSSPARSE_SPMV_ROCSPARSE(Kokkos::complex, Kokkos::LayoutRight, #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include +#include "KokkosSparse_Utils_mkl.hpp" namespace 
KokkosSparse { namespace Impl { @@ -537,27 +536,6 @@ namespace Impl { #if (__INTEL_MKL__ > 2017) // MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() -// Note 12/03/21 - lbv: -// mkl_safe_call and mode_kk_to_mkl should -// be moved to some sparse or mkl utility -// header. It is likely that these will be -// reused for other kernels. -inline void mkl_safe_call(int errcode) { - if (errcode != SPARSE_STATUS_SUCCESS) - throw std::runtime_error("MKL returned non-success error code"); -} - -inline sparse_operation_t mode_kk_to_mkl(char mode_kk) { - switch (toupper(mode_kk)) { - case 'N': return SPARSE_OPERATION_NON_TRANSPOSE; - case 'T': return SPARSE_OPERATION_TRANSPOSE; - case 'H': return SPARSE_OPERATION_CONJUGATE_TRANSPOSE; - default:; - } - throw std::invalid_argument( - "Invalid mode for MKL (should be one of N, T, H)"); -} - inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, int m, int n, const int* Arowptrs, const int* Aentries, const float* Avalues, const float* x, float* y) { @@ -566,11 +544,12 @@ inline void spmv_mkl(sparse_operation_t op, float alpha, float beta, int m, A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; - mkl_safe_call(mkl_sparse_s_create_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_csr( &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); - mkl_safe_call(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + KOKKOSKERNELS_MKL_SAFE_CALL( + mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, int m, @@ -581,11 +560,12 @@ inline void spmv_mkl(sparse_operation_t op, double alpha, double beta, int m, A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; - mkl_safe_call(mkl_sparse_d_create_csr( + 
KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), const_cast(Avalues))); - mkl_safe_call(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); + KOKKOSKERNELS_MKL_SAFE_CALL( + mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); } inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, @@ -599,15 +579,15 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; - mkl_safe_call(mkl_sparse_c_create_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_csr( &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex8*)Avalues)); - MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); - MKL_Complex8& beta_mkl = reinterpret_cast(beta); - mkl_safe_call(mkl_sparse_c_mv(op, alpha_mkl, A_mkl, A_descr, - reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); + MKL_Complex8 alpha_mkl{alpha.real(), alpha.imag()}; + MKL_Complex8 beta_mkl{beta.real(), beta.imag()}; + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_mv( + op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), + beta_mkl, reinterpret_cast(y))); } inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, @@ -621,15 +601,15 @@ inline void spmv_mkl(sparse_operation_t op, Kokkos::complex alpha, A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; A_descr.mode = SPARSE_FILL_MODE_FULL; A_descr.diag = SPARSE_DIAG_NON_UNIT; - mkl_safe_call(mkl_sparse_z_create_csr( + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_csr( &A_mkl, SPARSE_INDEX_BASE_ZERO, m, n, const_cast(Arowptrs), const_cast(Arowptrs + 1), const_cast(Aentries), (MKL_Complex16*)Avalues)); - MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); - MKL_Complex16& beta_mkl = reinterpret_cast(beta); - mkl_safe_call(mkl_sparse_z_mv(op, alpha_mkl, A_mkl, A_descr, - 
reinterpret_cast(x), - beta_mkl, reinterpret_cast(y))); + MKL_Complex16 alpha_mkl{alpha.real(), alpha.imag()}; + MKL_Complex16 beta_mkl{beta.real(), beta.imag()}; + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_mv( + op, alpha_mkl, A_mkl, A_descr, reinterpret_cast(x), + beta_mkl, reinterpret_cast(y))); } #define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ diff --git a/src/common/KokkosKernels_Controls.hpp b/src/sparse/KokkosKernels_Controls.hpp similarity index 88% rename from src/common/KokkosKernels_Controls.hpp rename to src/sparse/KokkosKernels_Controls.hpp index c5a47a24b3..aabe0069be 100644 --- a/src/common/KokkosKernels_Controls.hpp +++ b/src/sparse/KokkosKernels_Controls.hpp @@ -81,28 +81,23 @@ class Controls { // check if a parameter is already set bool isParameter(const std::string& name) const { - bool return_value = false; - - auto search = kernel_parameters.find(name); - if (search != kernel_parameters.end()) { - return_value = true; - } - - return return_value; + return kernel_parameters.end() != kernel_parameters.find(name); } - // retrieve the value associated with a parameter if it is already set - std::string getParameter(const std::string& name) const { + /// \brief get the value associated with \c name, or \c default if not present + /// + /// \param name the name of the parameter to retrieve + /// \param orUnset (default \c "" ) the value to return if \c name is not set + std::string getParameter(const std::string& name, + const std::string& orUnset = "") const { auto search = kernel_parameters.find(name); - std::string value; - if (search == kernel_parameters.end()) { - std::cout << "Parameter " << name - << " was not found in the list of parameters!" 
<< std::endl; - value = ""; + if (kernel_parameters.end() == search) { + std::cerr << "WARNING: Controls::getParameter for name \"" << name + << "\" was unset" << std::endl; + return orUnset; } else { - value = search->second; + return search->second; } - return value; } #ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS diff --git a/src/common/KokkosKernels_Handle.hpp b/src/sparse/KokkosKernels_Handle.hpp similarity index 99% rename from src/common/KokkosKernels_Handle.hpp rename to src/sparse/KokkosKernels_Handle.hpp index 0e9ba8dc4e..69a74c3e5d 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/sparse/KokkosKernels_Handle.hpp @@ -181,6 +181,7 @@ class KokkosKernelsHandle { this->gs_sptrsvUHandle = right_side_handle.get_gs_sptrsvU_handle(); this->spgemmHandle = right_side_handle.get_spgemm_handle(); + this->spaddHandle = right_side_handle.get_spadd_handle(); this->sptrsvHandle = right_side_handle.get_sptrsv_handle(); this->spilukHandle = right_side_handle.get_spiluk_handle(); diff --git a/src/sparse/KokkosSparse_IOUtils.hpp b/src/sparse/KokkosSparse_IOUtils.hpp new file mode 100644 index 0000000000..fa6d08f960 --- /dev/null +++ b/src/sparse/KokkosSparse_IOUtils.hpp @@ -0,0 +1,1274 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef _KOKKOSSPARSE_IOUTILS_HPP +#define _KOKKOSSPARSE_IOUTILS_HPP + +#include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +namespace KokkosSparse { +namespace Impl { + +// MD: Bases on Christian's sparseMatrix_generate function in test_crsmatrix.cpp +// file. +template +void kk_sparseMatrix_generate(OrdinalType nrows, OrdinalType ncols, + SizeType &nnz, OrdinalType row_size_variance, + OrdinalType bandwidth, ScalarType *&values, + SizeType *&rowPtr, OrdinalType *&colInd, + OrdinalType block_elem_count = 1) { + rowPtr = new SizeType[nrows + 1]; + + OrdinalType elements_per_row = nrows ? 
nnz / nrows : 0; + srand(13721); + rowPtr[0] = 0; + for (int row = 0; row < nrows; row++) { + int varianz = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; + int numRowEntries = elements_per_row + varianz; + if (numRowEntries < 0) numRowEntries = 0; + // Clamping numRowEntries above accomplishes 2 things: + // - If ncols is 0, numRowEntries will also be 0 + // - With numRowEntries at most 2/3 the number of columns, in the worst + // case + // 90% of insertions will succeed after 6 tries + if (numRowEntries > 0.66 * ncols) numRowEntries = 0.66 * ncols; + rowPtr[row + 1] = rowPtr[row] + numRowEntries; + } + nnz = rowPtr[nrows]; + values = new ScalarType[nnz]; + colInd = new OrdinalType[nnz]; + for (OrdinalType row = 0; row < nrows; row++) { + for (SizeType k = rowPtr[row]; k < rowPtr[row + 1]; ++k) { + while (true) { + OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row; + while (pos < 0) pos += ncols; + while (pos >= ncols) pos -= ncols; + + bool is_already_in_the_row = false; + for (SizeType j = rowPtr[row]; j < k; j++) { + if (colInd[j] == pos) { + is_already_in_the_row = true; + break; + } + } + if (!is_already_in_the_row) { + colInd[k] = pos; + break; + } + } + } + } + // Sample each value from uniform (-50, 50) for real types, or (-50 - 50i, 50 + // + 50i) for complex types. 
+ Kokkos::View valuesView( + values, nnz * block_elem_count); + ScalarType randStart, randEnd; + KokkosKernels::Impl::getRandomBounds(50.0, randStart, randEnd); + Kokkos::Random_XorShift64_Pool pool(13718); + Kokkos::fill_random(valuesView, pool, randStart, randEnd); +} + +template +void kk_sparseMatrix_generate_lower_upper_triangle( + char uplo, OrdinalType nrows, OrdinalType ncols, SizeType &nnz, + OrdinalType /*row_size_variance*/, OrdinalType /*bandwidth*/, + ScalarType *&values, SizeType *&rowPtr, OrdinalType *&colInd) { + rowPtr = new SizeType[nrows + 1]; + + // OrdinalType elements_per_row = nnz/nrows; + srand(13721); + rowPtr[0] = 0; + for (int row = 0; row < nrows; row++) { + if (uplo == 'L') + rowPtr[row + 1] = rowPtr[row] + row + 1; + else + rowPtr[row + 1] = rowPtr[row] + ncols - (row); + } + nnz = rowPtr[nrows]; + values = new ScalarType[nnz]; + colInd = new OrdinalType[nnz]; + for (OrdinalType row = 0; row < nrows; row++) { + for (SizeType k = rowPtr[row]; k < rowPtr[row + 1]; k++) { + if (uplo == 'L') + colInd[k] = k - rowPtr[row]; + else + colInd[k] = row + (k - rowPtr[row]); + values[k] = 1.0; + } + } +} + +template +void kk_diagonally_dominant_sparseMatrix_generate( + OrdinalType nrows, OrdinalType ncols, SizeType &nnz, + OrdinalType row_size_variance, OrdinalType bandwidth, ScalarType *&values, + SizeType *&rowPtr, OrdinalType *&colInd, + ScalarType diagDominance = 10 * Kokkos::ArithTraits::one()) { + rowPtr = new SizeType[nrows + 1]; + + OrdinalType elements_per_row = nnz / nrows; + srand(13721); + rowPtr[0] = 0; + for (int row = 0; row < nrows; row++) { + int varianz = (1.0 * rand() / RAND_MAX - 0.5) * row_size_variance; + if (varianz < 1) varianz = 1; + if (varianz > 0.75 * ncols) varianz = 0.75 * ncols; + rowPtr[row + 1] = rowPtr[row] + elements_per_row + varianz; + if (rowPtr[row + 1] <= rowPtr[row]) // This makes sure that there is + rowPtr[row + 1] = rowPtr[row] + 1; // at least one nonzero in the row + } + nnz = rowPtr[nrows]; + values = 
new ScalarType[nnz]; + colInd = new OrdinalType[nnz]; + for (OrdinalType row = 0; row < nrows; row++) { + ScalarType total_values = 0; + std::unordered_set entriesInRow; + // We always add the diagonal entry (after this loop) + entriesInRow.insert(row); + for (SizeType k = rowPtr[row]; k < rowPtr[row + 1] - 1; k++) { + while (true) { + OrdinalType pos = (1.0 * rand() / RAND_MAX - 0.5) * bandwidth + row; + while (pos < 0) pos += ncols; + while (pos >= ncols) pos -= ncols; + + if (entriesInRow.find(pos) == entriesInRow.end()) { + entriesInRow.insert(pos); + colInd[k] = pos; + values[k] = 100.0 * rand() / RAND_MAX - 50.0; + total_values += + Kokkos::Details::ArithTraits::abs(values[k]); + break; + } + } + } + + colInd[rowPtr[row + 1] - 1] = row; + values[rowPtr[row + 1] - 1] = total_values * diagDominance; + } +} + +// This function creates a diagonal sparse matrix for testing matrix operations. +// The elements on the diagonal are 1, 2, ..., n-1, n. +// If "invert" is true, it will return the inverse of the above diagonal matrix. 
+template +crsMat_t kk_generate_diag_matrix(typename crsMat_t::const_ordinal_type n, + const bool invert = false) { + typedef typename crsMat_t::ordinal_type ot; + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + typedef typename row_map_view_t::non_const_value_type size_type; + typedef typename cols_view_t::non_const_value_type lno_t; + typedef typename values_view_t::non_const_value_type scalar_t; + + row_map_view_t rowmap_view("rowmap_view", n + 1); + cols_view_t columns_view("colsmap_view", n); + values_view_t values_view("values_view", n); + + { + typename row_map_view_t::HostMirror hr = + Kokkos::create_mirror_view(rowmap_view); + typename cols_view_t::HostMirror hc = + Kokkos::create_mirror_view(columns_view); + typename values_view_t::HostMirror hv = + Kokkos::create_mirror_view(values_view); + + for (lno_t i = 0; i <= n; ++i) { + hr(i) = size_type(i); + } + + for (ot i = 0; i < n; ++i) { + hc(i) = lno_t(i); + if (invert) { + hv(i) = scalar_t(1.0) / (scalar_t(i + 1)); + } else { + hv(i) = scalar_t(i + 1); + } + } + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); + Kokkos::deep_copy(values_view, hv); + } + + graph_t static_graph(columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", n, values_view, static_graph); + return crsmat; +} + +template +crsMat_t kk_generate_diagonally_dominant_sparse_matrix( + typename crsMat_t::const_ordinal_type nrows, + typename crsMat_t::const_ordinal_type ncols, + typename crsMat_t::non_const_size_type &nnz, + typename crsMat_t::const_ordinal_type row_size_variance, + typename crsMat_t::const_ordinal_type bandwidth, + typename crsMat_t::const_value_type diagDominance = + 10 * Kokkos::ArithTraits::one()) { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef 
typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + typedef typename row_map_view_t::non_const_value_type size_type; + typedef typename cols_view_t::non_const_value_type lno_t; + typedef typename values_view_t::non_const_value_type scalar_t; + lno_t *adj; + size_type *xadj; //, nnzA; + scalar_t *values; + + kk_diagonally_dominant_sparseMatrix_generate( + nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj, + diagDominance); + + row_map_view_t rowmap_view("rowmap_view", nrows + 1); + cols_view_t columns_view("colsmap_view", nnz); + values_view_t values_view("values_view", nnz); + + { + typename row_map_view_t::HostMirror hr = + Kokkos::create_mirror_view(rowmap_view); + typename cols_view_t::HostMirror hc = + Kokkos::create_mirror_view(columns_view); + typename values_view_t::HostMirror hv = + Kokkos::create_mirror_view(values_view); + + for (lno_t i = 0; i <= nrows; ++i) { + hr(i) = xadj[i]; + } + + for (size_type i = 0; i < nnz; ++i) { + hc(i) = adj[i]; + hv(i) = values[i]; + } + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); + Kokkos::deep_copy(values_view, hv); + } + + graph_t static_graph(columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph); + delete[] xadj; + delete[] adj; + delete[] values; + return crsmat; +} + +template +crsMat_t kk_generate_triangular_sparse_matrix( + char uplo, typename crsMat_t::const_ordinal_type nrows, + typename crsMat_t::const_ordinal_type ncols, + typename crsMat_t::non_const_size_type &nnz, + typename crsMat_t::const_ordinal_type row_size_variance, + typename crsMat_t::const_ordinal_type bandwidth) { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; 
+ typedef typename crsMat_t::values_type::non_const_type values_view_t; + + typedef typename row_map_view_t::non_const_value_type size_type; + typedef typename cols_view_t::non_const_value_type lno_t; + typedef typename values_view_t::non_const_value_type scalar_t; + lno_t *adj; + size_type *xadj; //, nnzA; + scalar_t *values; + + kk_sparseMatrix_generate_lower_upper_triangle( + uplo, nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj); + + row_map_view_t rowmap_view("rowmap_view", nrows + 1); + cols_view_t columns_view("colsmap_view", nnz); + values_view_t values_view("values_view", nnz); + + { + typename row_map_view_t::HostMirror hr = + Kokkos::create_mirror_view(rowmap_view); + typename cols_view_t::HostMirror hc = + Kokkos::create_mirror_view(columns_view); + typename values_view_t::HostMirror hv = + Kokkos::create_mirror_view(values_view); + + for (lno_t i = 0; i <= nrows; ++i) { + hr(i) = xadj[i]; + } + + for (size_type i = 0; i < nnz; ++i) { + hc(i) = adj[i]; + hv(i) = values[i]; + } + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); + Kokkos::deep_copy(values_view, hv); + Kokkos::fence(); + } + + graph_t static_graph(columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph); + delete[] xadj; + delete[] adj; + delete[] values; + return crsmat; +} + +template +crsMat_t kk_generate_sparse_matrix( + typename crsMat_t::const_ordinal_type nrows, + typename crsMat_t::const_ordinal_type ncols, + typename crsMat_t::non_const_size_type &nnz, + typename crsMat_t::const_ordinal_type row_size_variance, + typename crsMat_t::const_ordinal_type bandwidth) { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + typedef typename row_map_view_t::non_const_value_type size_type; + 
typedef typename cols_view_t::non_const_value_type lno_t; + typedef typename values_view_t::non_const_value_type scalar_t; + lno_t *adj; + size_type *xadj; //, nnzA; + scalar_t *values; + + kk_sparseMatrix_generate( + nrows, ncols, nnz, row_size_variance, bandwidth, values, xadj, adj); + + row_map_view_t rowmap_view("rowmap_view", nrows + 1); + cols_view_t columns_view("colsmap_view", nnz); + values_view_t values_view("values_view", nnz); + + { + typename row_map_view_t::HostMirror hr = + Kokkos::create_mirror_view(rowmap_view); + typename cols_view_t::HostMirror hc = + Kokkos::create_mirror_view(columns_view); + typename values_view_t::HostMirror hv = + Kokkos::create_mirror_view(values_view); + + for (lno_t i = 0; i <= nrows; ++i) { + hr(i) = xadj[i]; + } + + for (size_type i = 0; i < nnz; ++i) { + hc(i) = adj[i]; + hv(i) = values[i]; + } + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); + Kokkos::deep_copy(values_view, hv); + } + + graph_t static_graph(columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph); + delete[] xadj; + delete[] adj; + delete[] values; + return crsmat; +} + +template +bsrMat_t kk_generate_sparse_matrix( + typename bsrMat_t::const_ordinal_type block_dim, + typename bsrMat_t::const_ordinal_type nrows, + typename bsrMat_t::const_ordinal_type ncols, + typename bsrMat_t::non_const_size_type &nnz, + typename bsrMat_t::const_ordinal_type row_size_variance, + typename bsrMat_t::const_ordinal_type bandwidth) { + typedef KokkosSparse::CrsMatrix< + typename bsrMat_t::value_type, typename bsrMat_t::ordinal_type, + typename bsrMat_t::device_type, typename bsrMat_t::memory_traits, + typename bsrMat_t::size_type> + crsMat_t; + + const auto crs_mtx = kk_generate_sparse_matrix( + nrows * block_dim, ncols * block_dim, nnz, row_size_variance, bandwidth); + bsrMat_t bsrmat(crs_mtx, block_dim); + return bsrmat; +} +// TODO: need to fix the size_type. All over the reading inputs are lno_t. 
+ +template +void convert_crs_to_lower_triangle_edge_list(idx nv, idx *xadj, idx *adj, + idx *lower_triangle_srcs, + idx *lower_triangle_dests) { + idx ind = 0; + for (idx i = 0; i < nv; ++i) { + idx xb = xadj[i]; + idx xe = xadj[i + 1]; + for (idx j = xb; j < xe; ++j) { + idx dst = adj[j]; + if (i < dst) { + lower_triangle_srcs[ind] = i; + lower_triangle_dests[ind++] = dst; + } + } + } +} + +template +void convert_crs_to_edge_list(idx nv, idx *xadj, idx *srcs) { + for (idx i = 0; i < nv; ++i) { + idx xb = xadj[i]; + idx xe = xadj[i + 1]; + for (idx j = xb; j < xe; ++j) { + srcs[j] = i; + } + } +} + +template +void convert_edge_list_to_csr(lno_t nv, size_type ne, lno_t *srcs, lno_t *dests, + wt *ew, size_type *xadj, lno_t *adj, wt *crs_ew) { + std::vector> edges(ne); + for (size_type i = 0; i < ne; ++i) { + edges[i].src = srcs[i]; + edges[i].dst = dests[i]; + edges[i].ew = ew[i]; + } + std::sort(edges.begin(), edges.begin() + ne); + + size_type eind = 0; + for (lno_t i = 0; i < nv; ++i) { + (xadj)[i] = eind; + while (edges[eind].src == i) { + (adj)[eind] = edges[eind].dst; + (*crs_ew)[eind] = edges[eind].ew; + ++eind; + } + } + xadj[nv] = eind; +} + +template +void convert_undirected_edge_list_to_csr(lno_t nv, size_type ne, in_lno_t *srcs, + in_lno_t *dests, size_type *xadj, + lno_t *adj) { + std::vector> edges(ne * 2); + for (size_type i = 0; i < ne; ++i) { + edges[i * 2].src = srcs[i]; + edges[i * 2].dst = dests[i]; + + edges[i * 2 + 1].src = dests[i]; + edges[i * 2 + 1].dst = srcs[i]; + } +#ifdef KOKKOSKERNELS_HAVE_OUTER +#include +#include +#include +#include + __gnu_parallel::parallel_sort_mwms< + false, true, struct KokkosKernels::Impl::Edge *>( + &(edges[0]), &(edges[0]) + ne * 2, + std::less>(), 64); +#else + std::sort(edges.begin(), edges.begin() + ne * 2); +#endif + + size_type eind = 0; + for (lno_t i = 0; i < nv; ++i) { + (xadj)[i] = eind; + while (edges[eind].src == i) { + (adj)[eind] = edges[eind].dst; + //(*crs_ew)[eind] = edges[eind].ew; + ++eind; + 
} + } + xadj[nv] = eind; +} + +template +void write_graph_bin(lno_t nv, size_type ne, const size_type *xadj, + const lno_t *adj, const scalar_t *ew, + const char *filename) { + std::ofstream myFile(filename, std::ios::out | std::ios::binary); + myFile.write((char *)&nv, sizeof(lno_t)); + myFile.write((char *)&ne, sizeof(size_type)); + myFile.write((char *)xadj, sizeof(size_type) * (nv + 1)); + + myFile.write((char *)adj, sizeof(lno_t) * (ne)); + + myFile.write((char *)ew, sizeof(scalar_t) * (ne)); + + myFile.close(); +} + +template +void write_graph_crs(lno_t nv, size_type ne, const size_type *xadj, + const lno_t *adj, const scalar_t *ew, + const char *filename) { + std::ofstream myFile(filename, std::ios::out); + myFile << nv << " " << ne << std::endl; + + for (lno_t i = 0; i <= nv; ++i) { + myFile << xadj[i] << " "; + } + myFile << std::endl; + + for (lno_t i = 0; i < nv; ++i) { + size_type b = xadj[i]; + size_type e = xadj[i + 1]; + for (size_type j = b; j < e; ++j) { + myFile << adj[j] << " "; + } + myFile << std::endl; + } + for (size_type i = 0; i < ne; ++i) { + myFile << ew[i] << " "; + } + myFile << std::endl; + + myFile.close(); +} + +template +void write_graph_ligra(lno_t nv, size_type ne, const size_type *xadj, + const lno_t *adj, const scalar_t * /*ew*/, + const char *filename) { + std::ofstream ff(filename); + ff << "AdjacencyGraph" << std::endl; + ff << nv << std::endl << ne << std::endl; + for (lno_t i = 0; i < nv; ++i) { + ff << xadj[i] << std::endl; + } + for (size_type i = 0; i < ne; ++i) { + ff << adj[i] << std::endl; + } + ff.close(); +} + +// MM: types and utility functions for parsing the MatrixMarket format +namespace MM { +enum MtxObject { UNDEFINED_OBJECT, MATRIX, VECTOR }; +enum MtxFormat { UNDEFINED_FORMAT, COORDINATE, ARRAY }; +enum MtxField { + UNDEFINED_FIELD, + REAL, // includes both float and double + COMPLEX, // includes complex and complex + INTEGER, // includes all integer types + PATTERN // not a type, but means the value for 
every entry is 1 +}; +enum MtxSym { + UNDEFINED_SYMMETRY, + GENERAL, + SYMMETRIC, // A(i, j) = A(j, i) + SKEW_SYMMETRIC, // A(i, j) = -A(j, i) + HERMITIAN // A(i, j) = a + bi; A(j, i) = a - bi +}; + +// readScalar/writeScalar: read and write a scalar in the form that it appears +// in an .mtx file. The >> and << operators won't work, because complex appears +// as "real imag", not "(real, imag)" +template +scalar_t readScalar(std::istream &is) { + scalar_t val; + is >> val; + return val; +} + +template <> +inline Kokkos::complex readScalar(std::istream &is) { + float r, i; + is >> r; + is >> i; + return Kokkos::complex(r, i); +} + +template <> +inline Kokkos::complex readScalar(std::istream &is) { + double r, i; + is >> r; + is >> i; + return Kokkos::complex(r, i); +} + +template +void writeScalar(std::ostream &os, scalar_t val) { + os << val; +} + +template <> +inline void writeScalar(std::ostream &os, Kokkos::complex val) { + os << val.real() << ' ' << val.imag(); +} + +template <> +inline void writeScalar(std::ostream &os, Kokkos::complex val) { + os << val.real() << ' ' << val.imag(); +} + +// symmetryFlip: given a value for A(i, j), return the value that +// should be inserted at A(j, i) (if any) +template +scalar_t symmetryFlip(scalar_t val, MtxSym symFlag) { + if (symFlag == SKEW_SYMMETRIC) return -val; + return val; +} + +template <> +inline Kokkos::complex symmetryFlip(Kokkos::complex val, + MtxSym symFlag) { + if (symFlag == HERMITIAN) + return Kokkos::conj(val); + else if (symFlag == SKEW_SYMMETRIC) + return -val; + return val; +} + +template <> +inline Kokkos::complex symmetryFlip(Kokkos::complex val, + MtxSym symFlag) { + if (symFlag == HERMITIAN) + return Kokkos::conj(val); + else if (symFlag == SKEW_SYMMETRIC) + return -val; + return val; +} +} // namespace MM + +template +void write_matrix_mtx(lno_t nrows, lno_t ncols, size_type nentries, + const size_type *xadj, const lno_t *adj, + const scalar_t *vals, const char *filename) { + std::ofstream 
myFile(filename); + myFile << "%%MatrixMarket matrix coordinate "; + if (std::is_same>::value || + std::is_same>::value) + myFile << "complex"; + else + myFile << "real"; + myFile << " general\n"; + myFile << nrows << " " << ncols << " " << nentries << '\n'; + myFile << std::setprecision(17) << std::scientific; + for (lno_t i = 0; i < nrows; ++i) { + size_type b = xadj[i]; + size_type e = xadj[i + 1]; + for (size_type j = b; j < e; ++j) { + myFile << i + 1 << " " << adj[j] + 1 << " "; + MM::writeScalar(myFile, vals[j]); + myFile << '\n'; + } + } + myFile.close(); +} + +template +void write_graph_mtx(lno_t nv, size_type ne, const size_type *xadj, + const lno_t *adj, const scalar_t *ew, + const char *filename) { + std::ofstream myFile(filename); + myFile << "%%MatrixMarket matrix coordinate "; + if (std::is_same>::value || + std::is_same>::value) + myFile << "complex"; + else + myFile << "real"; + myFile << " general\n"; + myFile << nv << " " << nv << " " << ne << '\n'; + myFile << std::setprecision(8) << std::scientific; + for (lno_t i = 0; i < nv; ++i) { + size_type b = xadj[i]; + size_type e = xadj[i + 1]; + for (size_type j = b; j < e; ++j) { + myFile << i + 1 << " " << (adj)[j] + 1 << " "; + MM::writeScalar(myFile, ew[j]); + myFile << '\n'; + } + } + + myFile.close(); +} + +template +void read_graph_bin(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, + scalar_t **ew, const char *filename) { + std::ifstream myFile(filename, std::ios::in | std::ios::binary); + + myFile.read((char *)nv, sizeof(lno_t)); + myFile.read((char *)ne, sizeof(size_type)); + KokkosKernels::Impl::md_malloc(xadj, *nv + 1); + KokkosKernels::Impl::md_malloc(adj, *ne); + KokkosKernels::Impl::md_malloc(ew, *ne); + myFile.read((char *)*xadj, sizeof(size_type) * (*nv + 1)); + myFile.read((char *)*adj, sizeof(lno_t) * (*ne)); + myFile.read((char *)*ew, sizeof(scalar_t) * (*ne)); + myFile.close(); +} + +// When Kokkos issue #2313 is resolved, can delete +// parseScalar and just use 
operator>> +template +scalar_t parseScalar(std::istream &is) { + scalar_t val; + is >> val; + return val; +} + +template <> +inline Kokkos::complex parseScalar(std::istream &is) { + std::complex val; + is >> val; + return Kokkos::complex(val); +} + +template <> +inline Kokkos::complex parseScalar(std::istream &is) { + std::complex val; + is >> val; + return Kokkos::complex(val); +} + +template +void read_graph_crs(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, + scalar_t **ew, const char *filename) { + std::ifstream myFile(filename, std::ios::in); + myFile >> *nv >> *ne; + + KokkosKernels::Impl::md_malloc(xadj, *nv + 1); + KokkosKernels::Impl::md_malloc(adj, *ne); + KokkosKernels::Impl::md_malloc(ew, *ne); + + for (lno_t i = 0; i <= *nv; ++i) { + myFile >> (*xadj)[i]; + } + + for (size_type i = 0; i < *ne; ++i) { + myFile >> (*adj)[i]; + } + for (size_type i = 0; i < *ne; ++i) { + (*ew)[i] = parseScalar(myFile); + } + myFile.close(); +} + +template +void write_kokkos_crst_matrix(crs_matrix_t a_crsmat, const char *filename) { + typedef typename crs_matrix_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crs_matrix_t::values_type::non_const_type values_view_t; + + typedef typename row_map_view_t::value_type offset_t; + typedef typename cols_view_t::value_type lno_t; + typedef typename values_view_t::value_type scalar_t; + typedef typename values_view_t::size_type size_type; + + size_type nnz = a_crsmat.nnz(); + + auto a_rowmap_view = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), a_crsmat.graph.row_map); + auto a_entries_view = Kokkos::create_mirror_view_and_copy( + Kokkos::HostSpace(), a_crsmat.graph.entries); + auto a_values_view = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_crsmat.values); + offset_t *a_rowmap = const_cast(a_rowmap_view.data()); + lno_t *a_entries = 
a_entries_view.data(); + scalar_t *a_values = a_values_view.data(); + + std::string strfilename(filename); + if (KokkosKernels::Impl::endswith(strfilename, ".mtx") || + KokkosKernels::Impl::endswith(strfilename, ".mm")) { + write_matrix_mtx( + a_crsmat.numRows(), a_crsmat.numCols(), a_crsmat.nnz(), a_rowmap, + a_entries, a_values, filename); + return; + } else if (a_crsmat.numRows() != a_crsmat.numCols()) { + throw std::runtime_error( + "For formats other than MatrixMarket (suffix .mm or .mtx),\n" + "write_kokkos_crst_matrix only supports square matrices"); + } + if (KokkosKernels::Impl::endswith(strfilename, ".bin")) { + write_graph_bin( + a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename); + } else if (KokkosKernels::Impl::endswith(strfilename, ".ligra")) { + write_graph_ligra( + a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename); + } else if (KokkosKernels::Impl::endswith(strfilename, ".crs")) { + write_graph_crs( + a_crsmat.numRows(), nnz, a_rowmap, a_entries, a_values, filename); + } else { + std::string errMsg = + std::string("write_kokkos_crst_matrix: File extension on ") + filename + + " does not correspond to a known format"; + throw std::runtime_error(errMsg); + } +} + +template +int read_mtx(const char *fileName, lno_t *nrows, lno_t *ncols, size_type *ne, + size_type **xadj, lno_t **adj, scalar_t **ew, + bool symmetrize = false, bool remove_diagonal = true, + bool transpose = false) { + using namespace MM; + std::ifstream mmf(fileName, std::ifstream::in); + if (!mmf.is_open()) { + throw std::runtime_error("File cannot be opened\n"); + } + + std::string fline = ""; + getline(mmf, fline); + + if (fline.size() < 2 || fline[0] != '%' || fline[1] != '%') { + throw std::runtime_error("Invalid MM file. 
Line-1\n"); + } + + // make sure every required field is in the file, by initializing them to + // UNDEFINED_* + MtxObject mtx_object = UNDEFINED_OBJECT; + MtxFormat mtx_format = UNDEFINED_FORMAT; + MtxField mtx_field = UNDEFINED_FIELD; + MtxSym mtx_sym = UNDEFINED_SYMMETRY; + + if (fline.find("matrix") != std::string::npos) { + mtx_object = MATRIX; + } else if (fline.find("vector") != std::string::npos) { + mtx_object = VECTOR; + throw std::runtime_error( + "MatrixMarket \"vector\" is not supported by KokkosKernels read_mtx()"); + } + + if (fline.find("coordinate") != std::string::npos) { + // sparse + mtx_format = COORDINATE; + } else if (fline.find("array") != std::string::npos) { + // dense + mtx_format = ARRAY; + } + + if (fline.find("real") != std::string::npos || + fline.find("double") != std::string::npos) { + if (std::is_same::value || + std::is_same::value) + mtx_field = REAL; + else { + if (!std::is_floating_point::value) + throw std::runtime_error( + "scalar_t in read_mtx() incompatible with float or double typed " + "MatrixMarket file."); + else + mtx_field = REAL; + } + } else if (fline.find("complex") != std::string::npos) { + if (!(std::is_same>::value || + std::is_same>::value)) + throw std::runtime_error( + "scalar_t in read_mtx() incompatible with complex-typed MatrixMarket " + "file."); + else + mtx_field = COMPLEX; + } else if (fline.find("integer") != std::string::npos) { + if (std::is_integral::value || + std::is_floating_point::value || + std::is_same::value || + std::is_same::value) + mtx_field = INTEGER; + else + throw std::runtime_error( + "scalar_t in read_mtx() incompatible with integer-typed MatrixMarket " + "file."); + } else if (fline.find("pattern") != std::string::npos) { + mtx_field = PATTERN; + // any reasonable choice for scalar_t can represent "1" or "1.0 + 0i", so + // nothing to check here + } + + if (fline.find("general") != std::string::npos) { + mtx_sym = GENERAL; + } else if (fline.find("skew-symmetric") != 
std::string::npos) { + mtx_sym = SKEW_SYMMETRIC; + } else if (fline.find("symmetric") != std::string::npos) { + // checking for "symmetric" after "skew-symmetric" because it's a substring + mtx_sym = SYMMETRIC; + } else if (fline.find("hermitian") != std::string::npos || + fline.find("Hermitian") != std::string::npos) { + mtx_sym = HERMITIAN; + } + // Validate the matrix attributes + if (mtx_format == ARRAY) { + if (mtx_sym == UNDEFINED_SYMMETRY) mtx_sym = GENERAL; + if (mtx_sym != GENERAL) + throw std::runtime_error( + "array format MatrixMarket file must have general symmetry (optional " + "to include \"general\")"); + } + if (mtx_object == UNDEFINED_OBJECT) + throw std::runtime_error( + "MatrixMarket file header is missing the object type."); + if (mtx_format == UNDEFINED_FORMAT) + throw std::runtime_error("MatrixMarket file header is missing the format."); + if (mtx_field == UNDEFINED_FIELD) + throw std::runtime_error( + "MatrixMarket file header is missing the field type."); + if (mtx_sym == UNDEFINED_SYMMETRY) + throw std::runtime_error( + "MatrixMarket file header is missing the symmetry type."); + + while (1) { + getline(mmf, fline); + if (fline[0] != '%') break; + } + std::stringstream ss(fline); + lno_t nr = 0, nc = 0; + size_type nnz = 0; + ss >> nr >> nc; + if (mtx_format == COORDINATE) + ss >> nnz; + else + nnz = nr * nc; + size_type numEdges = nnz; + symmetrize = symmetrize || mtx_sym != GENERAL; + if (symmetrize && nr != nc) { + throw std::runtime_error("A non-square matrix cannot be symmetrized."); + } + if (mtx_format == ARRAY) { + // Array format only supports general symmetry and non-pattern + if (symmetrize) + throw std::runtime_error( + "array format MatrixMarket file cannot be symmetrized."); + if (mtx_field == PATTERN) + throw std::runtime_error( + "array format MatrixMarket file can't have \"pattern\" field type."); + } + if (symmetrize) { + numEdges = 2 * nnz; + } + // numEdges is only an upper bound (diagonal entries may be removed) + 
std::vector> edges( + numEdges); + size_type nE = 0; + lno_t numDiagonal = 0; + for (size_type i = 0; i < nnz; ++i) { + getline(mmf, fline); + std::stringstream ss2(fline); + struct KokkosKernels::Impl::Edge tmp; + // read source, dest (edge) and weight (value) + lno_t s, d; + scalar_t w; + if (mtx_format == ARRAY) { + // In array format, entries are listed in column major order, + // so the row and column can be determined just from the index i + //(but make them 1-based indices, to match the way coordinate works) + s = i % nr + 1; // row + d = i / nr + 1; // col + } else { + // In coordinate format, row and col of each entry is read from file + ss2 >> s >> d; + } + if (mtx_field == PATTERN) + w = 1; + else + w = readScalar(ss2); + if (!transpose) { + tmp.src = s - 1; + tmp.dst = d - 1; + tmp.ew = w; + } else { + tmp.src = d - 1; + tmp.dst = s - 1; + tmp.ew = w; + } + if (tmp.src == tmp.dst) { + numDiagonal++; + if (!remove_diagonal) { + edges[nE++] = tmp; + } + continue; + } + edges[nE++] = tmp; + if (symmetrize) { + struct KokkosKernels::Impl::Edge tmp2; + tmp2.src = tmp.dst; + tmp2.dst = tmp.src; + // the symmetrized value is w, -w or conj(w) if mtx_sym is + // SYMMETRIC, SKEW_SYMMETRIC or HERMITIAN, respectively. 
+ tmp2.ew = symmetryFlip(tmp.ew, mtx_sym); + edges[nE++] = tmp2; + } + } + mmf.close(); + std::sort(edges.begin(), edges.begin() + nE); + if (transpose) { + lno_t tmp = nr; + nr = nc; + nc = tmp; + } + // report the dimensions to the caller; *ne is overwritten below with the number of entries actually stored + *nrows = nr; + *ncols = nc; + *ne = nE; + //*xadj = new idx[nr + 1]; + KokkosKernels::Impl::md_malloc(xadj, nr + 1); + //*adj = new idx[nE]; + KokkosKernels::Impl::md_malloc(adj, nE); + //*ew = new wt[nE]; + KokkosKernels::Impl::md_malloc(ew, nE); + size_type eind = 0; + size_type actual = 0; + for (lno_t i = 0; i < nr; ++i) { + (*xadj)[i] = actual; + bool is_first = true; + while (eind < nE && edges[eind].src == i) { + if (is_first || !symmetrize || eind == 0 || + (eind > 0 && edges[eind - 1].dst != edges[eind].dst)) { + (*adj)[actual] = edges[eind].dst; + (*ew)[actual] = edges[eind].ew; + ++actual; + } + is_first = false; + ++eind; + } + } + (*xadj)[nr] = actual; + *ne = actual; + return 0; +} + +// Version of read_mtx which does not capture the number of columns. +// This is the old interface; it's kept for backwards compatibility. 
+template +int read_mtx(const char *fileName, lno_t *nv, size_type *ne, size_type **xadj, + lno_t **adj, scalar_t **ew, bool symmetrize = false, + bool remove_diagonal = true, bool transpose = false) { + lno_t ncol; // receives the column count, which this overload discards + return read_mtx(fileName, nv, &ncol, ne, xadj, + adj, ew, symmetrize, + remove_diagonal, transpose); +} + +template +void read_matrix(lno_t *nv, size_type *ne, size_type **xadj, lno_t **adj, + scalar_t **ew, const char *filename) { + std::string strfilename(filename); + if (KokkosKernels::Impl::endswith(strfilename, ".mtx") || + KokkosKernels::Impl::endswith(strfilename, ".mm")) { + read_mtx(filename, nv, ne, xadj, adj, ew, false, false, false); + } + + else if (KokkosKernels::Impl::endswith(strfilename, ".bin")) { + read_graph_bin(nv, ne, xadj, adj, ew, filename); + } + + else if (KokkosKernels::Impl::endswith(strfilename, ".crs")) { + read_graph_crs(nv, ne, xadj, adj, ew, filename); + } + + else { + throw std::runtime_error("Reader is not available\n"); + } +} + +template +crsMat_t read_kokkos_crst_matrix(const char *filename_) { + std::string strfilename(filename_); + bool isMatrixMarket = KokkosKernels::Impl::endswith(strfilename, ".mtx") || + KokkosKernels::Impl::endswith(strfilename, ".mm"); + + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type row_map_view_t; + typedef typename graph_t::entries_type::non_const_type cols_view_t; + typedef typename crsMat_t::values_type::non_const_type values_view_t; + + typedef typename row_map_view_t::value_type size_type; + typedef typename cols_view_t::value_type lno_t; + typedef typename values_view_t::value_type scalar_t; + + lno_t nr, nc, *adj; + size_type *xadj, nnzA; + scalar_t *values; + + if (isMatrixMarket) { + // MatrixMarket file contains the exact number of columns + read_mtx(filename_, &nr, &nc, &nnzA, &xadj, + &adj, &values, false, false, false); + } else { + //.crs and .bin files don't contain #cols, so will compute it 
later based on + // the entries + read_matrix(&nr, &nnzA, &xadj, &adj, &values, + filename_); + } + + row_map_view_t rowmap_view("rowmap_view", nr + 1); + cols_view_t columns_view("colsmap_view", nnzA); + values_view_t values_view("values_view", nnzA); + + { + Kokkos::View> + hr(xadj, nr + 1); + Kokkos::View> + hc(adj, nnzA); + Kokkos::View> + hv(values, nnzA); + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); + Kokkos::deep_copy(values_view, hv); + } + + if (!isMatrixMarket) { + KokkosKernels::Impl::kk_view_reduce_max( + nnzA, columns_view, nc); + nc++; + } + + graph_t static_graph(columns_view, rowmap_view); + crsMat_t crsmat("CrsMatrix", nc, values_view, static_graph); + delete[] xadj; + delete[] adj; + delete[] values; + return crsmat; +} + +template +crsGraph_t read_kokkos_crst_graph(const char *filename_) { + typedef typename crsGraph_t::row_map_type::non_const_type row_map_view_t; + typedef typename crsGraph_t::entries_type::non_const_type cols_view_t; + + typedef typename row_map_view_t::value_type size_type; + typedef typename cols_view_t::value_type lno_t; + typedef double scalar_t; + + lno_t nv, *adj; + size_type *xadj, nnzA; + scalar_t *values; + read_matrix(&nv, &nnzA, &xadj, &adj, &values, + filename_); + + row_map_view_t rowmap_view("rowmap_view", nv + 1); + cols_view_t columns_view("colsmap_view", nnzA); + + { + typename row_map_view_t::HostMirror hr = + Kokkos::create_mirror_view(rowmap_view); + typename cols_view_t::HostMirror hc = + Kokkos::create_mirror_view(columns_view); + + for (lno_t i = 0; i <= nv; ++i) { + hr(i) = xadj[i]; + } + + for (size_type i = 0; i < nnzA; ++i) { + hc(i) = adj[i]; + } + Kokkos::deep_copy(rowmap_view, hr); + Kokkos::deep_copy(columns_view, hc); + } + + lno_t ncols = 0; + KokkosKernels::Impl::kk_view_reduce_max( + nnzA, columns_view, ncols); + ncols += 1; + + crsGraph_t static_graph(columns_view, rowmap_view, ncols); + delete[] xadj; + delete[] adj; + delete[] values; + return static_graph; 
+} + +template +inline void kk_sequential_create_incidence_matrix( + nnz_lno_t num_rows, const size_type *xadj, const nnz_lno_t *adj, + size_type *i_adj // output; preallocated by the caller +) { + std::vector c_xadj(num_rows); + for (nnz_lno_t i = 0; i < num_rows; i++) { + c_xadj[i] = xadj[i]; + } + int eCnt = 0; + for (nnz_lno_t i = 0; i < num_rows; i++) { + size_type begin = xadj[i]; + size_type end = xadj[i + 1]; + nnz_lno_t adjsize = end - begin; + + for (nnz_lno_t j = 0; j < adjsize; j++) { + size_type aind = j + begin; + nnz_lno_t col = adj[aind]; + if (i < col) { + i_adj[c_xadj[i]++] = eCnt; + i_adj[c_xadj[col]++] = eCnt++; + } + } + } + + for (nnz_lno_t i = 0; i < num_rows; i++) { + if (c_xadj[i] != xadj[i + 1]) { + std::cout << "i:" << i << " c_xadj[i]:" << c_xadj[i] + << " xadj[i+1]:" << xadj[i + 1] << std::endl; + } + } +} + +template +inline void kk_sequential_create_incidence_matrix_transpose( + const nnz_lno_t num_rows, const size_type num_edges, const size_type *xadj, + const nnz_lno_t *adj, + size_type *i_xadj, // output; preallocated by the caller + nnz_lno_t *i_adj // output; preallocated by the caller +) { + for (nnz_lno_t i = 0; i < num_edges / 2 + 1; i++) { + i_xadj[i] = i * 2; + } + int eCnt = 0; + for (nnz_lno_t i = 0; i < num_rows; i++) { + size_type begin = xadj[i]; + size_type end = xadj[i + 1]; + nnz_lno_t adjsize = end - begin; + + for (nnz_lno_t j = 0; j < adjsize; j++) { + size_type aind = j + begin; + nnz_lno_t col = adj[aind]; + if (i < col) { + i_adj[eCnt++] = i; + i_adj[eCnt++] = col; + } + } + } +} + +} // namespace Impl +} // namespace KokkosSparse +#endif // _KOKKOSSPARSE_IOUTILS_HPP diff --git a/src/sparse/KokkosSparse_SortCrs.hpp b/src/sparse/KokkosSparse_SortCrs.hpp new file mode 100644 index 0000000000..68de6b5f7c --- /dev/null +++ b/src/sparse/KokkosSparse_SortCrs.hpp @@ -0,0 +1,722 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef _KOKKOSSPARSE_SORTCRS_HPP +#define _KOKKOSSPARSE_SORTCRS_HPP + +#include "Kokkos_Core.hpp" +#include "KokkosKernels_Sorting.hpp" + +namespace KokkosSparse { + +// ---------------------------------- +// BSR matrix/graph sorting utilities +// ---------------------------------- + +// Sort a BSR matrix: within each row, sort entries ascending by column and +// permute the values accordingly. +template +void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, + const entries_t& entries, const values_t& values); + +template +void sort_bsr_matrix(const bsrMat_t& A); + +// ---------------------------------- +// CRS matrix/graph sorting utilities +// ---------------------------------- + +// The sort_crs* functions sort the adjacent column list for each row into +// ascending order. + +template +void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, + const values_t& values); + +template +void sort_crs_matrix(const crsMat_t& A); + +template +void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries); + +template +void sort_crs_graph(const crsGraph_t& G); + +// sort_and_merge_matrix produces a new matrix which is equivalent to A but is +// sorted and has no duplicated entries: each (i, j) is unique. Values for +// duplicated entries are summed. 
+template +crsMat_t sort_and_merge_matrix(const crsMat_t& A); + +template +crsGraph_t sort_and_merge_graph(const crsGraph_t& G); + +template +void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, rowmap_t& rowmap_out, + entries_t& entries_out); + +namespace Impl { + +template +struct SortCrsMatrixFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using scalar_t = typename values_t::non_const_value_type; + using team_mem = typename Kokkos::TeamPolicy::member_type; + // The functor owns memory for entriesAux, so it can't have + // MemoryTraits + using entries_managed_t = Kokkos::View; + using values_managed_t = Kokkos::View; + + SortCrsMatrixFunctor(bool usingRangePol, const rowmap_t& rowmap_, + const entries_t& entries_, const values_t& values_) + : rowmap(rowmap_), entries(entries_), values(values_) { + if (usingRangePol) { + entriesAux = entries_managed_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), + entries.extent(0)); + valuesAux = values_managed_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values aux"), + values.extent(0)); + } + // otherwise, aux arrays won't be allocated (sorting in place) + } + + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + // Radix sort requires unsigned keys for comparison + using unsigned_lno_t = typename std::make_unsigned::type; + KokkosKernels::SerialRadixSort2( + (unsigned_lno_t*)entries.data() + rowStart, + (unsigned_lno_t*)entriesAux.data() + rowStart, values.data() + rowStart, + valuesAux.data() + rowStart, rowNum); + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const { + size_type i = t.league_rank(); + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + 
KokkosKernels::TeamBitonicSort2( + entries.data() + rowStart, values.data() + rowStart, rowNum, t); + } + + rowmap_t rowmap; + entries_t entries; + entries_managed_t entriesAux; + values_t values; + values_managed_t valuesAux; +}; + +template +struct SortCrsGraphFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using team_mem = typename Kokkos::TeamPolicy::member_type; + // The functor owns memory for entriesAux, so it can't have + // MemoryTraits + using entries_managed_t = Kokkos::View; + + SortCrsGraphFunctor(bool usingRangePol, const rowmap_t& rowmap_, + const entries_t& entries_) + : rowmap(rowmap_), entries(entries_) { + if (usingRangePol) { + entriesAux = entries_managed_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries aux"), + entries.extent(0)); + } + // otherwise, aux arrays won't be allocated (sorting in place) + } + + KOKKOS_INLINE_FUNCTION void operator()(const lno_t i) const { + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + // Radix sort requires unsigned keys for comparison + using unsigned_lno_t = typename std::make_unsigned::type; + KokkosKernels::SerialRadixSort( + (unsigned_lno_t*)entries.data() + rowStart, + (unsigned_lno_t*)entriesAux.data() + rowStart, rowNum); + } + + KOKKOS_INLINE_FUNCTION void operator()(const team_mem t) const { + size_type i = t.league_rank(); + size_type rowStart = rowmap(i); + size_type rowEnd = rowmap(i + 1); + lno_t rowNum = rowEnd - rowStart; + KokkosKernels::TeamBitonicSort( + entries.data() + rowStart, rowNum, t); + } + + rowmap_t rowmap; + entries_t entries; + entries_managed_t entriesAux; +}; + +template +struct MergedRowmapFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using c_rowmap_t = typename rowmap_t::const_type; + + // Precondition: entries are sorted within each row + 
MergedRowmapFunctor(const rowmap_t& mergedCounts_, const c_rowmap_t& rowmap_, + const entries_t& entries_) + : mergedCounts(mergedCounts_), rowmap(rowmap_), entries(entries_) {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row, size_type& lnewNNZ) const { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if (rowEnd == rowBegin) { + // Row was empty to begin with + mergedCounts(row) = 0; + return; + } + // Otherwise, the first entry in the row exists + lno_t uniqueEntries = 1; + for (size_type j = rowBegin + 1; j < rowEnd; j++) { + if (entries(j - 1) != entries(j)) uniqueEntries++; + } + mergedCounts(row) = uniqueEntries; + lnewNNZ += uniqueEntries; + if (row == lno_t((rowmap.extent(0) - 1) - 1)) mergedCounts(row + 1) = 0; + } + + rowmap_t mergedCounts; + c_rowmap_t rowmap; + entries_t entries; +}; + +template +struct MatrixMergedEntriesFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using scalar_t = typename values_t::non_const_value_type; + + // Precondition: entries are sorted within each row + MatrixMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_, + const values_t& values_, + const rowmap_t& mergedRowmap_, + const entries_t& mergedEntries_, + const values_t& mergedValues_) + : rowmap(rowmap_), + entries(entries_), + values(values_), + mergedRowmap(mergedRowmap_), + mergedEntries(mergedEntries_), + mergedValues(mergedValues_) {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if (rowEnd == rowBegin) { + // Row was empty to begin with, nothing to do + return; + } + // Otherwise, accumulate the value for each column + scalar_t accumVal = values(rowBegin); + lno_t accumCol = entries(rowBegin); + size_type insertPos = mergedRowmap(row); + for (size_type j = rowBegin + 1; j < rowEnd; j++) { + if (accumCol == entries(j)) { + // accumulate + 
accumVal += values(j); + } else { + // write out and reset + mergedValues(insertPos) = accumVal; + mergedEntries(insertPos) = accumCol; + insertPos++; + accumVal = values(j); + accumCol = entries(j); + } + } + // always left with the last unique entry + mergedValues(insertPos) = accumVal; + mergedEntries(insertPos) = accumCol; + } + + rowmap_t rowmap; + entries_t entries; + values_t values; + rowmap_t mergedRowmap; + entries_t mergedEntries; + values_t mergedValues; +}; + +template +struct GraphMergedEntriesFunctor { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + + // Precondition: entries are sorted within each row + GraphMergedEntriesFunctor(const rowmap_t& rowmap_, const entries_t& entries_, + const rowmap_t& mergedRowmap_, + const entries_t& mergedEntries_) + : rowmap(rowmap_), + entries(entries_), + mergedRowmap(mergedRowmap_), + mergedEntries(mergedEntries_) {} + + KOKKOS_INLINE_FUNCTION void operator()(lno_t row) const { + size_type rowBegin = rowmap(row); + size_type rowEnd = rowmap(row + 1); + if (rowEnd == rowBegin) { + // Row was empty to begin with, nothing to do + return; + } + // Otherwise, accumulate the value for each column + lno_t accumCol = entries(rowBegin); + size_type insertPos = mergedRowmap(row); + for (size_type j = rowBegin + 1; j < rowEnd; j++) { + if (accumCol != entries(j)) { + // write out and reset + mergedEntries(insertPos) = accumCol; + insertPos++; + accumCol = entries(j); + } + } + // always left with the last unique entry + mergedEntries(insertPos) = accumCol; + } + + rowmap_t rowmap; + entries_t entries; + rowmap_t mergedRowmap; + entries_t mergedEntries; +}; + +template +KOKKOS_INLINE_FUNCTION void kk_swap(T& a, T& b) { + T t = a; + a = b; + b = t; +} + +template +struct sort_bsr_functor { + using lno_t = typename entries_type::non_const_value_type; + + row_map_type rowmap; + entries_type entries; + values_type values; + const lno_t blocksize; + + 
sort_bsr_functor(row_map_type rowmap_, entries_type entries_, + values_type values_, const lno_t blocksize_) + : rowmap(rowmap_), + entries(entries_), + values(values_), + blocksize(blocksize_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const lno_t i) const { + const lno_t rowStart = rowmap(i); + const lno_t rowSize = rowmap(i + 1) - rowStart; + auto* e = entries.data() + rowStart; + auto* v = values.data() + rowStart * blocksize; + bool done = false; + while (!done) { + done = true; + for (lno_t j = 1; j < rowSize; ++j) { + const lno_t jp = j - 1; + if (e[jp] <= e[j]) continue; + Impl::kk_swap(e[jp], e[j]); + auto const vb = v + j * blocksize; + auto const vbp = v + jp * blocksize; + for (lno_t k = 0; k < blocksize; + ++k) // std::swap_ranges(vb, vb + blocksize, vbp); + Impl::kk_swap(vb[k], vbp[k]); + done = false; + } + } + } +}; + +} // namespace Impl + +// Sort a CRS matrix: within each row, sort entries ascending by column. +// At the same time, permute the values. +template +void sort_crs_matrix(const rowmap_t& rowmap, const entries_t& entries, + const values_t& values) { + using lno_t = typename entries_t::non_const_value_type; + using team_pol = Kokkos::TeamPolicy; + bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if (numRows == 0) return; + Impl::SortCrsMatrixFunctor + funct(useRadix, rowmap, entries, values); + if (useRadix) { + Kokkos::parallel_for("sort_crs_matrix", + Kokkos::RangePolicy(0, numRows), + funct); + } else { + // Try to get teamsize to be largest power of 2 not greater than avg entries + // per row + // TODO (probably important for performance): add thread-level sort also, and + // use that for small avg degree. But this works for now. 
+ lno_t idealTeamSize = 1; + lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; + while (idealTeamSize < avgDeg / 2) { + idealTeamSize *= 2; + } + team_pol temp(numRows, 1); + lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); + lno_t teamSize = std::min(idealTeamSize, maxTeamSize); + Kokkos::parallel_for("sort_crs_matrix", team_pol(numRows, teamSize), funct); + } +} + +template +void sort_crs_matrix(const crsMat_t& A) { + // Note: rowmap_t has const values, but that's OK as sorting doesn't modify it + using rowmap_t = typename crsMat_t::row_map_type; + using entries_t = typename crsMat_t::index_type::non_const_type; + using values_t = typename crsMat_t::values_type::non_const_type; + using exec_space = typename crsMat_t::execution_space; + // NOTE: the rowmap of a StaticCrsGraph is const-valued, but the + // entries and CrsMatrix values are non-const (so sorting them directly + // is allowed) + sort_crs_matrix( + A.graph.row_map, A.graph.entries, A.values); +} + +// Sort a BRS matrix: within each row, sort entries ascending by column and +// permute the values accordingly. +template +void sort_bsr_matrix(const lno_t blockdim, const rowmap_t& rowmap, + const entries_t& entries, const values_t& values) { + // TODO: this is O(N^2) mock for debugging - do regular implementation based + // on Radix/Bitonic sort (like CSR) IDEA: maybe we need only one general + // Radix2/Bitonic2 and CSR sorting may call it with blockSize=1 ? + lno_t numRows = rowmap.extent(0) ? 
rowmap.extent(0) - 1 : 0; + if (numRows == 0) return; + const lno_t blocksize = blockdim * blockdim; + + assert(values.extent(0) == entries.extent(0) * blocksize); + Impl::sort_bsr_functor bsr_sorter( + rowmap, entries, values, blocksize); + Kokkos::parallel_for("sort_bsr_matrix", + Kokkos::RangePolicy(0, numRows), + bsr_sorter); +} + +// Sort a BSR matrix (like CRS but single values are replaced with contignous +// blocks) +template +void sort_bsr_matrix(const bsrMat_t& A) { + // NOTE: unlike rowmap, entries and values are non-const, so we can sort them + // directly + sort_bsr_matrix( + A.blockDim(), A.graph.row_map, A.graph.entries, A.values); +} + +// Sort a CRS graph: within each row, sort entries ascending by column. +template +void sort_crs_graph(const rowmap_t& rowmap, const entries_t& entries) { + using lno_t = typename entries_t::non_const_value_type; + using team_pol = Kokkos::TeamPolicy; + bool useRadix = !KokkosKernels::Impl::kk_is_gpu_exec_space(); + lno_t numRows = rowmap.extent(0) ? rowmap.extent(0) - 1 : 0; + if (numRows == 0) return; + Impl::SortCrsGraphFunctor funct( + useRadix, rowmap, entries); + if (useRadix) { + Kokkos::parallel_for("sort_crs_graph", + Kokkos::RangePolicy(0, numRows), + funct); + } else { + // Try to get teamsize to be largest power of 2 less than or equal to + // half the entries per row. 0.5 * #entries is bitonic's parallelism within + // a row. + // TODO (probably important for performnce): add thread-level sort also, and + // use that for small avg degree. But this works for now. 
+ lno_t idealTeamSize = 1; + lno_t avgDeg = (entries.extent(0) + numRows - 1) / numRows; + while (idealTeamSize < avgDeg / 2) { + idealTeamSize *= 2; + } + team_pol temp(numRows, 1); + lno_t maxTeamSize = temp.team_size_max(funct, Kokkos::ParallelForTag()); + lno_t teamSize = std::min(idealTeamSize, maxTeamSize); + Kokkos::parallel_for("sort_crs_graph", team_pol(numRows, teamSize), funct); + } +} + +template +void sort_crs_graph(const crsGraph_t& G) { + static_assert( + !std::is_const::value, + "sort_crs_graph requires StaticCrsGraph entries to be non-const."); + sort_crs_graph(G.row_map, G.entries); +} + +// Sort the rows of matrix, and merge duplicate entries. +template +crsMat_t sort_and_merge_matrix(const crsMat_t& A) { + using c_rowmap_t = typename crsMat_t::row_map_type; + using rowmap_t = typename crsMat_t::row_map_type::non_const_type; + using entries_t = typename crsMat_t::index_type::non_const_type; + using values_t = typename crsMat_t::values_type::non_const_type; + using size_type = typename rowmap_t::non_const_value_type; + using exec_space = typename crsMat_t::execution_space; + using range_t = Kokkos::RangePolicy; + sort_crs_matrix(A); + // Count entries per row into a new rowmap, in terms of merges that can be + // done + rowmap_t mergedRowmap( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"), + A.numRows() + 1); + size_type numCompressedEntries = 0; + Kokkos::parallel_reduce(range_t(0, A.numRows()), + Impl::MergedRowmapFunctor( + mergedRowmap, A.graph.row_map, A.graph.entries), + numCompressedEntries); + // Prefix sum to get rowmap + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + A.numRows() + 1, mergedRowmap); + entries_t mergedEntries("SortedMerged entries", numCompressedEntries); + values_t mergedValues("SortedMerged values", numCompressedEntries); + // Compute merged entries and values + Kokkos::parallel_for( + range_t(0, A.numRows()), + Impl::MatrixMergedEntriesFunctor( + A.graph.row_map, A.graph.entries, 
A.values, mergedRowmap, + mergedEntries, mergedValues)); + // Finally, construct the new compressed matrix + return crsMat_t("SortedMerged", A.numRows(), A.numCols(), + numCompressedEntries, mergedValues, mergedRowmap, + mergedEntries); +} + +template +void sort_and_merge_graph(const typename rowmap_t::const_type& rowmap_in, + const entries_t& entries_in, rowmap_t& rowmap_out, + entries_t& entries_out) { + using size_type = typename rowmap_t::non_const_value_type; + using lno_t = typename entries_t::non_const_value_type; + using range_t = Kokkos::RangePolicy; + using const_rowmap_t = typename rowmap_t::const_type; + lno_t numRows = rowmap_in.extent(0); + if (numRows <= 1) { + // Matrix has zero rows + rowmap_out = rowmap_t(); + entries_out = entries_t(); + return; + } + numRows--; + // Sort in place + sort_crs_graph(rowmap_in, entries_in); + // Count entries per row into a new rowmap, in terms of merges that can be + // done + rowmap_out = rowmap_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "SortedMerged rowmap"), + numRows + 1); + size_type numCompressedEntries = 0; + Kokkos::parallel_reduce(range_t(0, numRows), + Impl::MergedRowmapFunctor( + rowmap_out, rowmap_in, entries_in), + numCompressedEntries); + // Prefix sum to get rowmap + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + numRows + 1, rowmap_out); + entries_out = entries_t("SortedMerged entries", numCompressedEntries); + // Compute merged entries and values + Kokkos::parallel_for( + range_t(0, numRows), + Impl::GraphMergedEntriesFunctor( + rowmap_in, entries_in, rowmap_out, entries_out)); +} + +template +crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { + using rowmap_t = typename crsGraph_t::row_map_type::non_const_type; + using entries_t = typename crsGraph_t::entries_type; + static_assert( + !std::is_const::value, + "sort_and_merge_graph requires StaticCrsGraph entries to be non-const."); + rowmap_t mergedRowmap; + entries_t mergedEntries; + sort_and_merge_graph(G.row_map, 
G.entries, mergedRowmap, + mergedEntries); + return crsGraph_t(mergedEntries, mergedRowmap); +} + +} // namespace KokkosSparse + +namespace KokkosKernels { + +// ---------------------------------- +// BSR matrix/graph sorting utilities +// ---------------------------------- + +// Sort a BRS matrix: within each row, sort entries ascending by column and +// permute the values accordingly. +template +[[deprecated]] void sort_bsr_matrix(const lno_t blockdim, + const rowmap_t& rowmap, + const entries_t& entries, + const values_t& values) { + KokkosSparse::sort_bsr_matrix(blockdim, rowmap, entries, values); +} + +template +[[deprecated]] void sort_bsr_matrix(const bsrMat_t& A) { + KokkosSparse::sort_bsr_matrix(A); +} + +// ---------------------------------- +// CRS matrix/graph sorting utilities +// ---------------------------------- + +// The sort_crs* functions sort the adjacent column list for each row into +// ascending order. + +template +[[deprecated]] void sort_crs_matrix(const rowmap_t& rowmap, + const entries_t& entries, + const values_t& values) { + KokkosSparse::sort_crs_matrix( + rowmap, entries, values); +} + +template +[[deprecated]] void sort_crs_matrix(const crsMat_t& A) { + KokkosSparse::sort_crs_matrix(A); +} + +template +[[deprecated]] void sort_crs_graph(const rowmap_t& rowmap, + const entries_t& entries) { + KokkosSparse::sort_crs_graph(rowmap, + entries); +} + +template +[[deprecated]] void sort_crs_graph(const crsGraph_t& G) { + KokkosSparse::sort_crs_graph(G); +} + +// sort_and_merge_matrix produces a new matrix which is equivalent to A but is +// sorted and has no duplicated entries: each (i, j) is unique. Values for +// duplicated entries are summed. 
+template +[[deprecated]] crsMat_t sort_and_merge_matrix(const crsMat_t& A) { + return KokkosSparse::sort_and_merge_matrix(A); // forward the merged matrix +} + +template +[[deprecated]] crsGraph_t sort_and_merge_graph(const crsGraph_t& G) { + return KokkosSparse::sort_and_merge_graph(G); // forward the merged graph +} + +template +[[deprecated]] void sort_and_merge_graph( + const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, + rowmap_t& rowmap_out, entries_t& entries_out) { + KokkosSparse::sort_and_merge_graph(rowmap_in, entries_in, rowmap_out, + entries_out); +} + +// For backward compatibility: keep the public interface accessible in +// KokkosKernels::Impl:: +namespace Impl { +template +[[deprecated]] void sort_crs_graph(const rowmap_t& rowmap, + const entries_t& entries) { + KokkosKernels::sort_crs_graph(rowmap, + entries); +} + +template +[[deprecated]] void sort_crs_matrix(const rowmap_t& rowmap, + const entries_t& entries, + const values_t& values) { + KokkosKernels::sort_crs_matrix(rowmap, entries, values); +} + +template +[[deprecated]] void sort_crs_matrix(const crsMat_t& A) { + KokkosKernels::sort_crs_matrix(A); +} + +template +[[deprecated]] void sort_and_merge_graph( + const typename rowmap_t::const_type& rowmap_in, const entries_t& entries_in, + rowmap_t& rowmap_out, entries_t& entries_out) { + KokkosKernels::sort_and_merge_graph( + rowmap_in, entries_in, rowmap_out, entries_out); +} + +template +[[deprecated]] crsMat_t sort_and_merge_matrix(const crsMat_t& A) { + return KokkosKernels::sort_and_merge_matrix(A); +} + +} // namespace Impl +} // namespace KokkosKernels + +#endif // _KOKKOSSPARSE_SORTCRS_HPP diff --git a/src/common/KokkosKernels_SparseUtils.hpp b/src/sparse/KokkosSparse_Utils.hpp similarity index 90% rename from src/common/KokkosKernels_SparseUtils.hpp rename to src/sparse/KokkosSparse_Utils.hpp index 323ae7846f..007b2aea85 100644 --- a/src/common/KokkosKernels_SparseUtils.hpp +++ b/src/sparse/KokkosSparse_Utils.hpp @@ -57,7 +57,7 @@ #include #endif -namespace KokkosKernels { 
+namespace KokkosSparse { enum SparseMatrixFormat { BlockCRS, @@ -72,7 +72,7 @@ namespace Impl { template -void kk_create_blockcrs_formated_point_crsmatrix( +void kk_create_blockcrs_formatted_point_crsmatrix( int block_size, size_t num_rows, size_t num_cols, in_row_view_t in_xadj, in_nnz_view_t in_adj, in_val_view_t in_vals, @@ -293,17 +293,17 @@ struct TransposeMatrix { struct CountTag {}; struct FillTag {}; - typedef Kokkos::TeamPolicy team_count_policy_t; - typedef Kokkos::TeamPolicy team_fill_policy_t; + using team_count_policy_t = Kokkos::TeamPolicy; + using team_fill_policy_t = Kokkos::TeamPolicy; - typedef typename team_count_policy_t::member_type team_count_member_t; - typedef typename team_fill_policy_t::member_type team_fill_member_t; + using team_count_member_t = typename team_count_policy_t::member_type; + using team_fill_member_t = typename team_fill_policy_t::member_type; - typedef typename in_nnz_view_t::non_const_value_type nnz_lno_t; - typedef typename in_row_view_t::non_const_value_type size_type; + using nnz_lno_t = typename in_nnz_view_t::non_const_value_type; + using size_type = typename in_row_view_t::non_const_value_type; - typename in_nnz_view_t::non_const_value_type num_rows; - typename in_nnz_view_t::non_const_value_type num_cols; + nnz_lno_t num_rows; + nnz_lno_t num_cols; in_row_view_t xadj; in_nnz_view_t adj; in_scalar_view_t vals; @@ -425,11 +425,12 @@ void transpose_matrix( // determine vector lanes per thread int thread_size = kk_get_suggested_vector_size( - num_rows, nnz, kk_get_exec_space_type()); + num_rows, nnz, + KokkosKernels::Impl::kk_get_exec_space_type()); // determine threads per team int team_size = kk_get_suggested_team_size( - thread_size, kk_get_exec_space_type()); + thread_size, KokkosKernels::Impl::kk_get_exec_space_type()); TransposeFunctor_t tm(num_rows, num_cols, xadj, adj, vals, t_xadj, t_adj, t_vals, tmp_row_view, true, team_size); @@ -439,8 +440,9 @@ void transpose_matrix( team_size, thread_size), tm); - 
kk_exclusive_parallel_prefix_sum(num_cols + 1, - t_xadj); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + num_cols + 1, t_xadj); Kokkos::deep_copy(tmp_row_view, t_xadj); @@ -508,11 +510,12 @@ void transpose_graph( // determine vector lanes per thread int thread_size = kk_get_suggested_vector_size( - num_rows, nnz, kk_get_exec_space_type()); + num_rows, nnz, + KokkosKernels::Impl::kk_get_exec_space_type()); // determine threads per team int team_size = kk_get_suggested_team_size( - thread_size, kk_get_exec_space_type()); + thread_size, KokkosKernels::Impl::kk_get_exec_space_type()); TransposeFunctor_t tm(num_rows, num_cols, xadj, adj, tmp1, t_xadj, t_adj, tmp2, tmp_row_view, false, team_size); @@ -522,8 +525,9 @@ void transpose_graph( team_size, thread_size), tm); - kk_exclusive_parallel_prefix_sum(num_cols + 1, - t_xadj); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + num_cols + 1, t_xadj); Kokkos::deep_copy(tmp_row_view, t_xadj); @@ -535,6 +539,116 @@ void transpose_graph( MyExecSpace().fence(); } +template +struct TransposeBsrMatrix { + using ordinal_type = typename in_nnz_view_t::non_const_value_type; + using size_type = typename in_row_view_t::non_const_value_type; + + int block_size; + in_row_view_t Arow_map; + in_nnz_view_t Aentries; + in_scalar_view_t Avalues; + out_row_view_t tArow_map; // allocated + out_nnz_view_t tAentries; // allocated + out_scalar_view_t tAvalues; // allocated + + TransposeBsrMatrix(const int blockSize, in_row_view_t row_mapA, + in_nnz_view_t entriesA, in_scalar_view_t valuesA, + out_row_view_t row_mapAt, out_nnz_view_t entriesAt, + out_scalar_view_t valuesAt) + : block_size(blockSize), + Arow_map(row_mapA), + Aentries(entriesA), + Avalues(valuesA), + tArow_map(row_mapAt), + tAentries(entriesAt), + tAvalues(valuesAt){}; + + KOKKOS_INLINE_FUNCTION + void operator()(const int tArowIdx) const { + // Loop over entries in row + for (size_type tAentryIdx = tArow_map(tArowIdx); + tAentryIdx < tArow_map(tArowIdx + 1); 
++tAentryIdx) { + ordinal_type tAcolIdx = tAentries(tAentryIdx); + + // we have block tA(tArowIdx, tAcolIdx) starting at tAvalues(entryIdx) + // we need to find AentryIdx corresponding to A(tAcolIdx, tArowIdx) + size_type AentryIdx; + for (AentryIdx = Arow_map(tAcolIdx); AentryIdx < Arow_map(tAcolIdx + 1); + ++AentryIdx) { + if (tArowIdx == Aentries(AentryIdx)) break; + } + + // we loop over block_size*block_size Avalues starting at AentryIdx + // and store them into tAvalues in transpose order starting at tAentryIdx + for (int i = 0; i < block_size; ++i) { + for (int j = 0; j < block_size; ++j) { + tAvalues(tAentryIdx * block_size * block_size + i * block_size + j) = + Avalues(AentryIdx * block_size * block_size + j * block_size + i); + } + } + } + } +}; // TransposeBsrMatrix + +template +void transpose_bsr_matrix( + typename in_nnz_view_t::non_const_value_type num_rows, + typename in_nnz_view_t::non_const_value_type num_cols, const int block_size, + in_row_view_t xadj, in_nnz_view_t adj, in_scalar_view_t vals, + out_row_view_t t_xadj, // pre-allocated -- initialized with 0 + out_nnz_view_t t_adj, // pre-allocated -- no need for initialize + out_scalar_view_t t_vals // pre-allocated -- no need for initialize +) { + using TransposeBsrFunctor_type = + TransposeBsrMatrix; + + // Step 1: call transpose_graph of bsr matrix + transpose_graph(num_rows, num_cols, xadj, adj, + t_xadj, t_adj); + + // Step 2: transpose the values of A + Kokkos::RangePolicy my_policy(0, num_cols); + TransposeBsrFunctor_type my_functor(block_size, xadj, adj, vals, t_xadj, + t_adj, t_vals); + + Kokkos::parallel_for(my_policy, my_functor); + MyExecSpace().fence(); +} + +template +bsrMat_t transpose_bsr_matrix(const bsrMat_t &A) { + // Allocate views and call the other version of transpose_matrix + using c_rowmap_t = typename bsrMat_t::row_map_type; + using c_entries_t = typename bsrMat_t::index_type; + using c_values_t = typename bsrMat_t::values_type; + using rowmap_t = typename 
bsrMat_t::row_map_type::non_const_type; + using entries_t = typename bsrMat_t::index_type::non_const_type; + using values_t = typename bsrMat_t::values_type::non_const_type; + + rowmap_t AT_rowmap("Transpose rowmap", A.numCols() + 1); + entries_t AT_entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Transpose entries"), + A.nnz()); + values_t AT_values( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Transpose values"), + A.nnz() * A.blockDim() * A.blockDim()); + transpose_bsr_matrix( + A.numRows(), A.numCols(), A.blockDim(), A.graph.row_map, A.graph.entries, + A.values, AT_rowmap, AT_entries, AT_values); + // And construct the transpose crsMat_t + return bsrMat_t("Transpose", A.numCols(), A.numRows(), A.nnz(), AT_values, + AT_rowmap, AT_entries, A.blockDim()); +} + template struct Fill_Reverse_Scale_Functor { struct CountTag {}; @@ -715,7 +829,8 @@ void kk_create_reverse_map( // kk_inclusive_parallel_prefix_sum(tmp_reverse_size + 1, tmp_color_xadj); - kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( tmp_reverse_size + 1, tmp_color_xadj); MyExecSpace().fence(); @@ -750,7 +865,8 @@ void kk_create_reverse_map( // kk_inclusive_parallel_prefix_sum(num_reverse_elements + 1, reverse_map_xadj); - kk_exclusive_parallel_prefix_sum( + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( num_reverse_elements + 1, tmp_color_xadj); MyExecSpace().fence(); @@ -843,7 +959,8 @@ inline size_t kk_is_d1_coloring_valid( typename in_nnz_view_t::non_const_value_type num_rows, typename in_nnz_view_t::non_const_value_type /*num_cols*/, in_row_view_t xadj, in_nnz_view_t adj, in_color_view_t v_colors) { - ExecSpaceType my_exec_space = kk_get_exec_space_type(); + KokkosKernels::Impl::ExecSpaceType my_exec_space = + KokkosKernels::Impl::kk_get_exec_space_type(); int vector_size = kk_get_suggested_vector_size(num_rows, adj.extent(0), my_exec_space); int suggested_team_size = @@ -926,160 +1043,6 @@ void graph_min_max_degree(const rowmap_t 
&rowmap, ordinal_t &min_degree, max_degree = result.max_val; } -/* -template -struct IncidenceMatrix{ - - struct FillTag{}; - - typedef struct FillTag FillTag; - - typedef Kokkos::TeamPolicy team_fill_policy_t ; - typedef Kokkos::TeamPolicy > dynamic_team_fill_policy_t ; typedef -typename team_fill_policy_t::member_type team_fill_member_t ; - - typedef typename in_nnz_view_t::non_const_value_type nnz_lno_t; - typedef typename in_row_view_t::non_const_value_type size_type; - - - typename in_nnz_view_t::non_const_value_type num_rows; - in_row_view_t xadj; - in_nnz_view_t adj; - out_nnz_view_t t_adj; //allocated - typename in_row_view_t::non_const_type tmp_txadj; - nnz_lno_t team_work_size; - - IncidenceMatrix( - nnz_lno_t num_rows_, - in_row_view_t xadj_, - in_nnz_view_t adj_, - out_nnz_view_t t_adj_, - typename in_row_view_t::non_const_type tmp_txadj_, - nnz_lno_t team_row_work_size_): - num_rows(num_rows_), - xadj(xadj_), adj(adj_), - t_adj(t_adj_), - tmp_txadj(tmp_txadj_), team_work_size(team_row_work_size_) {} - - - KOKKOS_INLINE_FUNCTION - void operator()(const FillTag&, const team_fill_member_t & teamMember) const { - const nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size; - const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN(team_row_begin + -team_work_size, num_rows); - - - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember,team_row_begin,team_row_end), -[&] (const nnz_lno_t& row_index) { const size_type col_begin = xadj[row_index]; - const size_type col_end = xadj[row_index + 1]; - const nnz_lno_t left_work = col_end - col_begin; - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(teamMember, left_work), - [&] (nnz_lno_t i) { - const size_type adjind = i + col_begin; - const nnz_lno_t colIndex = adj[adjind]; - if (row_index < colIndex){ - - const size_type pos = -Kokkos::atomic_fetch_add(&(tmp_txadj(colIndex)),1); t_adj(adjind) = adjind; - t_adj(pos) = adjind; - } - }); - //} - }); - } -}; -*/ -/** - * \brief function returns 
transpose of the given graph. - * \param num_rows: num rows in input graph - * \param num_cols: num cols in input graph - * \param xadj: row pointers of the input graph - * \param adj: column indices of the input graph - * \param t_xadj: output, the row indices of the output graph. MUST BE - * INITIALIZED WITH ZEROES. \param t_adj: output, column indices. No need for - * initializations. \param vector_size: suggested vector size, optional. if -1, - * kernel will decide. \param suggested_team_size: suggested team size, - * optional. if -1, kernel will decide. \param team_work_chunk_size: suggested - * work size of a team, optional. if -1, kernel will decide. \param - * use_dynamic_scheduling: whether to use dynamic scheduling. Default is true. - */ -/* -template -inline void kk_create_incidence_matrix( - typename in_nnz_view_t::non_const_value_type num_rows, - in_row_view_t xadj, - in_nnz_view_t adj, - out_nnz_view_t i_adj, //pre-allocated -- no need for initialize -- size is -same as adj int vector_size = -1, int suggested_team_size = -1, typename -in_nnz_view_t::non_const_value_type team_work_chunk_size = -1, bool -use_dynamic_scheduling = true - ){ - - - typedef typename in_row_view_t::non_const_type tmp_row_view_t; - //allocate some memory for work for row pointers - tmp_row_view_t tmp_row_view(Kokkos::view_alloc(Kokkos::WithoutInitializing, -"tmp_row_view"), num_rows + 1); - - Kokkos::deep_copy(tmp_row_view, xadj); - - in_nnz_view_t tmp1; - out_nnz_view_t tmp2; - - //create the functor for tranpose. 
- typedef IncidenceMatrix < - in_row_view_t, in_nnz_view_t, in_nnz_view_t, - out_nnz_view_t, MyExecSpace> IncidenceMatrix_Functor_t; - - IncidenceMatrix_Functor_t tm ( num_rows, xadj, adj, - t_adj, tmp_row_view, - false, - team_work_chunk_size); - - - typedef typename IncidenceMatrix_Functor_t::team_fill_policy_t fill_tp_t; - typedef typename IncidenceMatrix_Functor_t::dynamic_team_fill_policy_t -d_fill_tp_t; - - typename in_row_view_t::non_const_value_type nnz = adj.extent(0); - - //set the vector size, if not suggested. - if (vector_size == -1) - vector_size = kk_get_suggested_vector_size(num_rows, nnz, -kk_get_exec_space_type()); - - //set the team size, if not suggested. - if (suggested_team_size == -1) - suggested_team_size = kk_get_suggested_team_size(vector_size, -kk_get_exec_space_type()); - - //set the chunk size, if not suggested. - if (team_work_chunk_size == -1) - team_work_chunk_size = suggested_team_size; - - - - if (use_dynamic_scheduling){ - Kokkos::parallel_for( fill_tp_t(num_rows / team_work_chunk_size + 1 , -suggested_team_size, vector_size), tm); - } - else { - Kokkos::parallel_for( d_fill_tp_t(num_rows / team_work_chunk_size + 1 , -suggested_team_size, vector_size), tm); - } - MyExecSpace().fence(); - -} -*/ - template void kk_get_lower_triangle_count_sequential(const lno_t nv, const size_type *in_xadj, @@ -1140,7 +1103,7 @@ struct LowerTriangularMatrix { scalar_t *t_vals; const lno_t team_work_size; - const ExecSpaceType exec_space; + const KokkosKernels::Impl::ExecSpaceType exec_space; const bool is_lower; LowerTriangularMatrix(const lno_t num_rows_, const size_type *xadj_, @@ -1157,7 +1120,8 @@ struct LowerTriangularMatrix { t_adj(t_adj_), t_vals(out_vals_), team_work_size(team_row_work_size_), - exec_space(kk_get_exec_space_type()), + exec_space( + KokkosKernels::Impl::kk_get_exec_space_type()), is_lower(is_lower_) {} KOKKOS_INLINE_FUNCTION @@ -1274,9 +1238,10 @@ void kk_get_lower_triangle_count_parallel( bool use_dynamic_scheduling = 
false, int chunksize = 4, bool is_lower = true) { const int vector_size = kk_get_suggested_vector_size( - nv, ne, kk_get_exec_space_type()); + nv, ne, KokkosKernels::Impl::kk_get_exec_space_type()); const int suggested_team_size = kk_get_suggested_team_size( - vector_size, kk_get_exec_space_type()); + vector_size, + KokkosKernels::Impl::kk_get_exec_space_type()); const int team_work_chunk_size = suggested_team_size * chunksize; typedef LowerTriangularMatrix ltm_t; @@ -1439,9 +1404,10 @@ void kk_get_lower_triangle_fill_parallel( bool use_dynamic_scheduling = false, bool chunksize = 4, bool is_lower = true) { const int vector_size = kk_get_suggested_vector_size( - nv, ne, kk_get_exec_space_type()); + nv, ne, KokkosKernels::Impl::kk_get_exec_space_type()); const int suggested_team_size = kk_get_suggested_team_size( - vector_size, kk_get_exec_space_type()); + vector_size, + KokkosKernels::Impl::kk_get_exec_space_type()); const int team_work_chunk_size = suggested_team_size * chunksize; typedef LowerTriangularMatrix @@ -1573,8 +1539,9 @@ crstmat_t kk_get_lower_triangle( nr, ne, rowmap, entries, new_row_map.data(), new_indices, use_dynamic_scheduling, chunksize); - kk_exclusive_parallel_prefix_sum(nr + 1, - new_row_map); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + nr + 1, new_row_map); exec_space().fence(); auto ll_size = Kokkos::subview(new_row_map, nr); @@ -1630,8 +1597,9 @@ crstmat_t kk_get_lower_crs_matrix( nr, ne, rowmap, entries, new_row_map.data(), new_indices, use_dynamic_scheduling, chunksize); - kk_exclusive_parallel_prefix_sum(nr + 1, - new_row_map); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + nr + 1, new_row_map); exec_space().fence(); auto ll_size = Kokkos::subview(new_row_map, nr); @@ -1683,8 +1651,9 @@ graph_t kk_get_lower_crs_graph(graph_t in_crs_matrix, kk_get_lower_triangle_count( nr, ne, rowmap, entries, new_row_map.data(), new_indices); - kk_exclusive_parallel_prefix_sum(nr + 1, - new_row_map); + 
KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + nr + 1, new_row_map); exec_space().fence(); auto ll_size = Kokkos::subview(new_row_map, nr); @@ -1736,8 +1705,9 @@ void kk_get_lower_triangle(typename cols_view_t::non_const_value_type nr, nr, ne, rowmap, entries, out_rowmap.data(), new_indices.data(), use_dynamic_scheduling, chunksize, is_lower); - kk_exclusive_parallel_prefix_sum(nr + 1, - out_rowmap); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, + out_rowmap); exec_space().fence(); auto ll_size = Kokkos::subview(out_rowmap, nr); @@ -1844,8 +1814,9 @@ void kk_create_incidence_matrix_from_original_matrix( permutation.data(), use_dynamic_scheduling, chunksize, sort_decreasing_order); exec_space().fence(); - kk_exclusive_parallel_prefix_sum(nr + 1, - out_rowmap); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum(nr + 1, + out_rowmap); // kk_print_1Dview(out_rowmap, false, 20); @@ -2069,21 +2040,21 @@ template struct MatrixTraits< KokkosSparse::CrsMatrix> { - static constexpr auto format = KokkosKernels::CRS; + static constexpr auto format = KokkosSparse::CRS; }; template struct MatrixTraits> { - static constexpr auto format = KokkosKernels::BlockCRS; + static constexpr auto format = KokkosSparse::BlockCRS; }; template struct MatrixTraits> { - static constexpr auto format = KokkosKernels::BSR; + static constexpr auto format = KokkosSparse::BSR; }; template @@ -2097,7 +2068,7 @@ struct MatrixConverter { KokkosSparse::CrsMatrix, typename blockCrsMat_t = KokkosSparse::Experimental::BlockCrsMatrix< scalar_t, lno_t, device, void, size_type>> - static blockCrsMat_t from_blockcrs_formated_point_crsmatrix( + static blockCrsMat_t from_blockcrs_formatted_point_crsmatrix( const KokkosSparse::CrsMatrix &mtx, lno_t block_size) { @@ -2111,7 +2082,7 @@ struct MatrixConverter { typename device, typename bsrMtx_t = KokkosSparse::Experimental::BsrMatrix< scalar_t, lno_t, device, void, size_type>> - static bsrMtx_t from_blockcrs_formated_point_crsmatrix( 
+ static bsrMtx_t from_blockcrs_formatted_point_crsmatrix( const KokkosSparse::CrsMatrix &mtx, lno_t block_size) { @@ -2120,6 +2091,17 @@ struct MatrixConverter { }; } // namespace Impl +} // namespace KokkosSparse + +namespace KokkosKernels { + +enum [[deprecated]] SparseMatrixFormat{ + BlockCRS, BSR, + CRS = BlockCRS, // convenience alias: for block_size=1 or no-blocks there + // is no difference in value ordering (so the format tag + // becomes irrelevant) +}; + } // namespace KokkosKernels #endif diff --git a/src/common/KokkosKernels_SparseUtils_cusparse.hpp b/src/sparse/KokkosSparse_Utils_cusparse.hpp similarity index 64% rename from src/common/KokkosKernels_SparseUtils_cusparse.hpp rename to src/sparse/KokkosSparse_Utils_cusparse.hpp index ea9bfd37dd..6e9eee5ab5 100644 --- a/src/common/KokkosKernels_SparseUtils_cusparse.hpp +++ b/src/sparse/KokkosSparse_Utils_cusparse.hpp @@ -114,6 +114,83 @@ inline void cusparse_internal_safe_call(cusparseStatus_t cusparseStatus, KokkosSparse::Impl::cusparse_internal_safe_call(call, #call, __FILE__, \ __LINE__) +template +cudaDataType cuda_data_type_from() { + // compile-time failure with a nice message if called on an unsupported type + static_assert(!std::is_same::value, + "cuSparse TPL does not support scalar type"); + // static_assert(false, ...) is allowed to error even if the code is not + // instantiated. 
obfuscate the predicate Despite this function being + // uncompilable, the compiler may decide that a return statement is missing, + // so throw to silence that + throw std::logic_error("unreachable throw after static_assert"); +} + +/* If half_t is not float, need to define a conversion for both + otherwise, conversion for half_t IS conversion for float +*/ +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +template <> +inline cudaDataType cuda_data_type_from() { + return CUDA_R_16F; // Kokkos half_t is a half +} +#endif +// half_t is defined to be float, so this works for both half_t and float when +// half_t is float +template <> +inline cudaDataType cuda_data_type_from() { + return CUDA_R_32F; // Kokkos half_t is a float +} +template <> +inline cudaDataType cuda_data_type_from() { + return CUDA_R_64F; +} +template <> +inline cudaDataType cuda_data_type_from>() { + return CUDA_C_32F; +} +template <> +inline cudaDataType cuda_data_type_from>() { + return CUDA_C_32F; +} + +#if defined(CUSPARSE_VERSION) && (10300 <= CUSPARSE_VERSION) + +template +cusparseIndexType_t cusparse_index_type_t_from() { +#define AS_STR_LITERAL_IMPL_(x) #x +#define AS_STR_LITERAL(x) AS_STR_LITERAL_IMPL_(x) + static_assert(!std::is_same::value, + "cuSparse " AS_STR_LITERAL( + CUSPARSE_VERSION) " TPL does not support index type"); + // static_assert(false, ...) is allowed to error even if the code is not + // instantiated. 
obfuscate the predicate Despite this function being + // uncompilable, the compiler may decide that a return statement is missing, + // so throw to silence that + throw std::logic_error("unreachable throw after static_assert"); +#undef AS_STR_LITERAL_IMPL_ +#undef AS_STR_LITERAL +} + +template <> +inline cusparseIndexType_t cusparse_index_type_t_from() { + return CUSPARSE_INDEX_32I; +} +template <> +inline cusparseIndexType_t cusparse_index_type_t_from() { + return CUSPARSE_INDEX_64I; +} +// Currently no CUSPARSE_INDEX_64U but this will work most of the time +template <> +inline cusparseIndexType_t cusparse_index_type_t_from() { + return CUSPARSE_INDEX_64I; +} +template <> +inline cusparseIndexType_t cusparse_index_type_t_from() { + return CUSPARSE_INDEX_16U; +} +#endif + } // namespace Impl } // namespace KokkosSparse diff --git a/src/sparse/KokkosSparse_Utils_mkl.hpp b/src/sparse/KokkosSparse_Utils_mkl.hpp new file mode 100644 index 0000000000..b9eb3a9bd2 --- /dev/null +++ b/src/sparse/KokkosSparse_Utils_mkl.hpp @@ -0,0 +1,259 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP +#define _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP + +#include "KokkosKernels_config.h" + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + +#include + +namespace KokkosSparse { +namespace Impl { + +inline void mkl_internal_safe_call(sparse_status_t mkl_status, const char *name, + const char *file = nullptr, + const int line = 0) { + if (SPARSE_STATUS_SUCCESS != mkl_status) { + std::ostringstream oss; + oss << "MKL call \"" << name << "\" at " << file << ":" << line + << " encountered error: "; + switch (mkl_status) { + case SPARSE_STATUS_NOT_INITIALIZED: + oss << "SPARSE_STATUS_NOT_INITIALIZED (empty handle or matrix arrays)"; + break; + case SPARSE_STATUS_ALLOC_FAILED: + oss << "SPARSE_STATUS_ALLOC_FAILED (internal error: memory allocation " + "failed)"; + break; + case SPARSE_STATUS_INVALID_VALUE: + oss << "SPARSE_STATUS_INVALID_VALUE (invalid 
input value)"; + break; + case SPARSE_STATUS_EXECUTION_FAILED: + oss << "SPARSE_STATUS_EXECUTION_FAILED (e.g. 0-diagonal element for " + "triangular solver)"; + break; + case SPARSE_STATUS_INTERNAL_ERROR: + oss << "SPARSE_STATUS_INTERNAL_ERROR"; + break; + case SPARSE_STATUS_NOT_SUPPORTED: + oss << "SPARSE_STATUS_NOT_SUPPORTED (e.g. operation for double " + "precision doesn't support other types)"; + break; + default: oss << "unknown (code " << (int)mkl_status << ")"; break; + } + oss << '\n'; + Kokkos::abort(oss.str().c_str()); + } +} + +#define KOKKOSKERNELS_MKL_SAFE_CALL(call) \ + KokkosSparse::Impl::mkl_internal_safe_call(call, #call, __FILE__, __LINE__) + +inline sparse_operation_t mode_kk_to_mkl(char mode_kk) { + switch (toupper(mode_kk)) { + case 'N': return SPARSE_OPERATION_NON_TRANSPOSE; + case 'T': return SPARSE_OPERATION_TRANSPOSE; + case 'H': return SPARSE_OPERATION_CONJUGATE_TRANSPOSE; + default:; + } + throw std::invalid_argument( + "Invalid mode for MKL (should be one of N, T, H)"); +} + +template +struct mkl_is_supported_value_type : std::false_type {}; + +template <> +struct mkl_is_supported_value_type : std::true_type {}; +template <> +struct mkl_is_supported_value_type : std::true_type {}; +template <> +struct mkl_is_supported_value_type> : std::true_type {}; +template <> +struct mkl_is_supported_value_type> : std::true_type {}; + +// MKLSparseMatrix provides thin wrapper around MKL matrix handle +// (sparse_matrix_t) and encapsulates MKL call dispatches related to details +// like value_type, allowing simple client code in kernels. 
+template +class MKLSparseMatrix { + sparse_matrix_t mtx; + + static_assert(mkl_is_supported_value_type::value, + "Scalar type used in MKLSparseMatrix is NOT " + "supported by MKL"); + + public: + inline MKLSparseMatrix(sparse_matrix_t mtx_) : mtx(mtx_) {} + + // Constructs MKL sparse matrix from KK sparse views (m rows x n cols) + inline MKLSparseMatrix(const MKL_INT num_rows, const MKL_INT num_cols, + MKL_INT *xadj, MKL_INT *adj, value_type *values); + + // Allows using MKLSparseMatrix directly in MKL calls + inline operator sparse_matrix_t() const { return mtx; } + + // Exports MKL sparse matrix contents into KK views + inline void export_data(MKL_INT &num_rows, MKL_INT &num_cols, + MKL_INT *&rows_start, MKL_INT *&columns, + value_type *&values); + + inline void destroy() { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_destroy(mtx)); + } +}; + +template <> +inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, + const MKL_INT cols, + MKL_INT *xadj, MKL_INT *adj, + float *values) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_s_create_csr( + &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj, values)); +} + +template <> +inline MKLSparseMatrix::MKLSparseMatrix(const MKL_INT rows, + const MKL_INT cols, + MKL_INT *xadj, MKL_INT *adj, + double *values) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_d_create_csr( + &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj, values)); +} + +template <> +inline MKLSparseMatrix>::MKLSparseMatrix( + const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, + Kokkos::complex *values) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_create_csr( + &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, adj, + reinterpret_cast(values))); +} + +template <> +inline MKLSparseMatrix>::MKLSparseMatrix( + const MKL_INT rows, const MKL_INT cols, MKL_INT *xadj, MKL_INT *adj, + Kokkos::complex *values) { + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_create_csr( + &mtx, SPARSE_INDEX_BASE_ZERO, rows, cols, xadj, xadj + 1, 
adj, + reinterpret_cast(values))); +} + +template <> +inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, + MKL_INT &num_cols, + MKL_INT *&rows_start, + MKL_INT *&columns, + float *&values) { + sparse_index_base_t indexing; + MKL_INT *rows_end; + KOKKOSKERNELS_MKL_SAFE_CALL( + mkl_sparse_s_export_csr(mtx, &indexing, &num_rows, &num_cols, &rows_start, + &rows_end, &columns, &values)); + if (SPARSE_INDEX_BASE_ZERO != indexing) { + throw std::runtime_error( + "Expected zero based indexing in exported MKL sparse matrix\n"); + return; + } +} + +template <> +inline void MKLSparseMatrix::export_data(MKL_INT &num_rows, + MKL_INT &num_cols, + MKL_INT *&rows_start, + MKL_INT *&columns, + double *&values) { + sparse_index_base_t indexing; + MKL_INT *rows_end; + KOKKOSKERNELS_MKL_SAFE_CALL( + mkl_sparse_d_export_csr(mtx, &indexing, &num_rows, &num_cols, &rows_start, + &rows_end, &columns, &values)); + if (SPARSE_INDEX_BASE_ZERO != indexing) { + throw std::runtime_error( + "Expected zero based indexing in exported MKL sparse matrix\n"); + return; + } +} + +template <> +inline void MKLSparseMatrix>::export_data( + MKL_INT &num_rows, MKL_INT &num_cols, MKL_INT *&rows_start, + MKL_INT *&columns, Kokkos::complex *&values) { + sparse_index_base_t indexing; + MKL_INT *rows_end; + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_c_export_csr( + mtx, &indexing, &num_rows, &num_cols, &rows_start, &rows_end, &columns, + reinterpret_cast(&values))); + if (SPARSE_INDEX_BASE_ZERO != indexing) { + throw std::runtime_error( + "Expected zero based indexing in exported MKL sparse matrix\n"); + return; + } +} + +template <> +inline void MKLSparseMatrix>::export_data( + MKL_INT &num_rows, MKL_INT &num_cols, MKL_INT *&rows_start, + MKL_INT *&columns, Kokkos::complex *&values) { + sparse_index_base_t indexing; + MKL_INT *rows_end; + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_z_export_csr( + mtx, &indexing, &num_rows, &num_cols, &rows_start, &rows_end, &columns, + reinterpret_cast(&values))); + if 
(SPARSE_INDEX_BASE_ZERO != indexing) { + throw std::runtime_error( + "Expected zero based indexing in exported MKL sparse matrix\n"); + return; + } +} + +} // namespace Impl +} // namespace KokkosSparse + +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL + +#endif // _KOKKOSKERNELS_SPARSEUTILS_MKL_HPP \ No newline at end of file diff --git a/src/common/KokkosKernels_SparseUtils_rocsparse.hpp b/src/sparse/KokkosSparse_Utils_rocsparse.hpp similarity index 100% rename from src/common/KokkosKernels_SparseUtils_rocsparse.hpp rename to src/sparse/KokkosSparse_Utils_rocsparse.hpp diff --git a/src/sparse/KokkosSparse_csc2csr.hpp b/src/sparse/KokkosSparse_csc2csr.hpp new file mode 100644 index 0000000000..32f0c2b745 --- /dev/null +++ b/src/sparse/KokkosSparse_csc2csr.hpp @@ -0,0 +1,250 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include "KokkosKernels_Utils.hpp" +#include + +#ifndef _KOKKOSSPARSE_CSC2CSR_HPP +#define _KOKKOSSPARSE_CSC2CSR_HPP +namespace KokkosSparse { +namespace Impl { +template +class Csc2Csr { + private: + using CrsST = typename ValViewType::value_type; + using CrsOT = OrdinalType; + using CrsET = typename ValViewType::execution_space; + using CrsMT = void; + using CrsSzT = SizeType; + using CrsType = CrsMatrix; + using CrsValsViewType = typename CrsType::values_type; + using CrsRowMapViewType = typename CrsType::row_map_type::non_const_type; + using CrsColIdViewType = typename CrsType::index_type; + + OrdinalType __nrows; + OrdinalType __ncols; + SizeType __nnz; + ValViewType __vals; + RowIdViewType __row_ids; + ColMapViewType __col_map; + + RowIdViewType __crs_row_cnt; + + CrsValsViewType __crs_vals; + CrsRowMapViewType __crs_row_map; + CrsRowMapViewType __crs_row_map_scratch; + CrsColIdViewType __crs_col_ids; + + public: + struct AlgoTags { + struct s1RowCnt {}; + struct s2RowMap {}; + struct s3Copy {}; + }; + + using s1RowCntTag = typename AlgoTags::s1RowCnt; 
+ using s3CopyTag = typename AlgoTags::s3Copy; + + private: + using TeamPolicyType = Kokkos::TeamPolicy; + + int __suggested_team_size, __suggested_vec_size, __league_size; + + template + void __run(FunctorType &functor) { + // s1RowCntTag + { + Kokkos::parallel_for("Csc2Csr", + Kokkos::RangePolicy(0, __nnz), + functor); + CrsET().fence(); + } + // s2RowMapTag + { + namespace KE = Kokkos::Experimental; + CrsET crsET; + // Use exclusive scan so we can allocate the row map uninitialized and + // avoid accessing device views on the host. + KE::exclusive_scan(crsET, KE::cbegin(__crs_row_cnt), + KE::cend(__crs_row_cnt), KE::begin(__crs_row_map), 0); + CrsET().fence(); + Kokkos::deep_copy(__crs_row_map_scratch, __crs_row_map); + CrsET().fence(); + } + // s3CopyTag + { + TeamPolicyType teamPolicy(__ncols, __suggested_team_size, + __suggested_vec_size); + Kokkos::parallel_for("Csc2Csr", teamPolicy, functor); + CrsET().fence(); + } + // TODO: s3CopySortCompressTag + } + + public: + template + class __Functor { + private: + OrdinalType __nrows; + OrdinalType __ncols; + SizeType __nnz; + ValViewType __vals; + CrsValsViewType __crs_vals; + RowIdViewType __row_ids; + CrsRowMapViewType __crs_row_map; + CrsRowMapViewType __crs_row_map_scratch; + ColMapViewType __col_map; + CrsColIdViewType __crs_col_ids; + RowIdViewType __crs_row_cnt; + + public: + __Functor(OrdinalType nrows, OrdinalType ncols, SizeType nnz, + ValViewType vals, CrsValsViewType crs_vals, RowIdViewType row_ids, + CrsRowMapViewType crs_row_map, + CrsRowMapViewType crs_row_map_scratch, ColMapViewType col_map, + CrsColIdViewType crs_col_ids, RowIdViewType crs_row_cnt) + : __nrows(nrows), + __ncols(ncols), + __nnz(nnz), + __vals(vals), + __crs_vals(crs_vals), + __row_ids(row_ids), + __crs_row_map(crs_row_map), + __crs_row_map_scratch(crs_row_map_scratch), + __col_map(col_map), + __crs_col_ids(crs_col_ids), + __crs_row_cnt(crs_row_cnt){}; + + KOKKOS_INLINE_FUNCTION + void operator()(const s3CopyTag &, const MemberType 
&member) const { + auto j = member.league_rank(); + auto col_start = __col_map(j); + auto col_len = __col_map(j + 1) - col_start; + + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, 0, col_len), [&](const int &k) { + auto idx = col_start + k; + auto i = __row_ids(idx); + auto crs_idx = + Kokkos::atomic_fetch_inc(&__crs_row_map_scratch.data()[i]); + __crs_col_ids(crs_idx) = j; + __crs_vals(crs_idx) = __vals(idx); + }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const s1RowCntTag &, const int &thread_id) const { + Kokkos::atomic_inc(&__crs_row_cnt.data()[__row_ids(thread_id)]); + } + }; + + Csc2Csr(OrdinalType nrows, OrdinalType ncols, SizeType nnz, ValViewType vals, + RowIdViewType row_ids, ColMapViewType col_map, int league_size = 2) + : __nrows(nrows), + __ncols(ncols), + __nnz(nnz), + __vals(vals), + __row_ids(row_ids), + __col_map(col_map), + __league_size(league_size) { + __crs_vals = CrsValsViewType( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "__crs_vals"), nnz); + __crs_row_map = CrsRowMapViewType( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "__crs_row_map"), + nrows + 1); + __crs_row_map_scratch = + CrsRowMapViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, + "__crs_row_map_scratch"), + nrows + 1); + __crs_col_ids = CrsColIdViewType( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "__crs_col_ids"), nnz); + + __crs_row_cnt = RowIdViewType("__crs_row_cnt", __nrows + 1); + + __Functor functor( + __nrows, __ncols, __nnz, __vals, __crs_vals, __row_ids, __crs_row_map, + __crs_row_map_scratch, __col_map, __crs_col_ids, __crs_row_cnt); + + KokkosKernels::Impl::get_suggested_vector_size( + __suggested_vec_size, __nrows, __nnz); + __suggested_team_size = + KokkosKernels::Impl::get_suggested_team_size( + functor, __suggested_vec_size); + + __run(functor); + } + + CrsType get_csrMat() { + return CrsType("csc2csr", __nrows, __ncols, __nnz, __crs_vals, + __crs_row_map, __crs_col_ids); + } +}; +} // namespace Impl +/// +/// \brief 
Converts a csc matrix to a CrsMatrix. +/// \tparam OrdinalType The view value type associated with the RowIdViewType +/// \tparam SizeType The type of nnz +/// \tparam ValViewType The values view type +/// \tparam RowIdViewType The row ids view type +/// \tparam ColMapViewType The column map view type +/// \param nrows The number of rows in the csc matrix +/// \param ncols The number of columns in the csc matrix +/// \param nnz The number of non-zeros in the csc matrix +/// \param vals The values view of the csc matrix +/// \param row_ids The row ids view of the csc matrix +/// \param col_map The column map view of the csc matrix +/// \return A KokkosSparse::CrsMatrix. +template +auto csc2csr(OrdinalType nrows, OrdinalType ncols, SizeType nnz, + ValViewType vals, RowIdViewType row_ids, ColMapViewType col_map, + int league_size) { + using Csc2csrType = Impl::Csc2Csr; + Csc2csrType csc2Csr(nrows, ncols, nnz, vals, row_ids, col_map, league_size); + return csc2Csr.get_csrMat(); +} +} // namespace KokkosSparse +#endif // _KOKKOSSPARSE_CSC2CSR_HPP diff --git a/src/sparse/KokkosSparse_gauss_seidel.hpp b/src/sparse/KokkosSparse_gauss_seidel.hpp index efe70dd1c5..1df960860b 100644 --- a/src/sparse/KokkosSparse_gauss_seidel.hpp +++ b/src/sparse/KokkosSparse_gauss_seidel.hpp @@ -132,7 +132,7 @@ void block_gauss_seidel_symbolic( is_graph_symmetric); } -template void gauss_seidel_numeric(KernelHandle *handle, @@ -207,7 +207,7 @@ void gauss_seidel_numeric(KernelHandle *handle, is_graph_symmetric); } -template void gauss_seidel_numeric(KernelHandle *handle, @@ -286,7 +286,7 @@ void gauss_seidel_numeric(KernelHandle *handle, is_graph_symmetric); } -template void block_gauss_seidel_numeric( @@ -307,7 +307,7 @@ void block_gauss_seidel_numeric( values, is_graph_symmetric); } -template @@ -437,7 +437,7 @@ void symmetric_gauss_seidel_apply( update_y_vector, omega, numIter, true, true); } -template @@ -471,7 +471,7 @@ void symmetric_block_gauss_seidel_apply( handle, num_rows, num_cols, 
row_map, entries, values, x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, omega, numIter); } -template @@ -603,7 +603,7 @@ void forward_sweep_gauss_seidel_apply( update_y_vector, omega, numIter, true, false); } -template @@ -637,7 +637,7 @@ void forward_sweep_block_gauss_seidel_apply( handle, num_rows, num_cols, row_map, entries, values, x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, omega, numIter); } -template @@ -769,7 +769,7 @@ void backward_sweep_gauss_seidel_apply( update_y_vector, omega, numIter, false, true); } -template diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp index 6db63455be..38bead14de 100644 --- a/src/sparse/KokkosSparse_spadd.hpp +++ b/src/sparse/KokkosSparse_spadd.hpp @@ -46,460 +46,13 @@ #define _KOKKOS_SPADD_HPP #include "KokkosKernels_Handle.hpp" -#include "KokkosKernels_Sorting.hpp" -#include "Kokkos_ArithTraits.hpp" +#include "KokkosKernels_helpers.hpp" +#include "KokkosSparse_spadd_symbolic_spec.hpp" +#include "KokkosSparse_spadd_numeric_spec.hpp" namespace KokkosSparse { namespace Experimental { -/* -Unsorted symbolic algorithm notes: --Only needs to sort and merge indices once, in symbolic (sorting is expensive) --Can't afford to allocate dense Views for indices/values (assume number of -columns is very large) -Want numeric() to know exactly where each A/B entry -belongs in Ccolinds/Cvalues -To accomplish all of these, symbolic() computes -arrays Apos and Bpos (both are type clno_nnz_view_t_, and have same length as -a_entries and b_entries respectively) -Apos/Bpos are saved in the handle -Apos -and Bpos each contain the final index within C row where the A/B entry belongs --See UnsortedNumericSumFunctor below for the usage of Apos/Bpos -*/ - -// Helper macro to check that two types are the same (ignoring const) -#define SAME_TYPE(A, B) \ - std::is_same::type, \ - typename std::remove_const::type>::value - -// get C rowmap for sorted input -template 
-struct SortedCountEntriesRange { - SortedCountEntriesRange(ordinal_type nrows_, - const typename ARowPtrsT::const_type& Arowptrs_, - const AColIndsT& Acolinds_, - const typename BRowPtrsT::const_type& Browptrs_, - const BColIndsT& Bcolinds_, - const CRowPtrsT& Crowcounts_) - : nrows(nrows_), - Arowptrs(Arowptrs_), - Acolinds(Acolinds_), - Browptrs(Browptrs_), - Bcolinds(Bcolinds_), - Crowcounts(Crowcounts_) {} - - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); - - // count the union of nonzeros in Arow and Brow - size_type numEntries = 0; - size_type ai = 0; - size_type bi = 0; - size_type Arowstart = Arowptrs(i); - size_type Arowlen = Arowptrs(i + 1) - Arowstart; - size_type Browstart = Browptrs(i); - size_type Browlen = Browptrs(i + 1) - Browstart; - ordinal_type Acol = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart); - ordinal_type Bcol = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart); - while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) { - ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol; - numEntries++; - // Eat all entries in both A and B which have this column - // This also results in Acol/Bcol being updated to following entries for - // next loop iter - while (Acol == Ccol) - Acol = (ai == Arowlen) ? ORDINAL_MAX : Acolinds(Arowstart + ai++); - while (Bcol == Ccol) - Bcol = (bi == Browlen) ? 
ORDINAL_MAX : Bcolinds(Browstart + bi++); - } - Crowcounts(i) = numEntries; - } - - ordinal_type nrows; - const typename ARowPtrsT::const_type Arowptrs; - const AColIndsT Acolinds; - const typename BRowPtrsT::const_type Browptrs; - const BColIndsT Bcolinds; - CRowPtrsT Crowcounts; -}; - -template -struct SortedCountEntriesTeam { - SortedCountEntriesTeam(ordinal_type nrows_, - const typename ARowPtrsT::const_type& Arowptrs_, - const AColIndsT& Acolinds_, - const typename BRowPtrsT::const_type& Browptrs_, - const BColIndsT& Bcolinds_, - const CRowPtrsT& Crowcounts_) - : nrows(nrows_), - Arowptrs(Arowptrs_), - Acolinds(Acolinds_), - Browptrs(Browptrs_), - Bcolinds(Bcolinds_), - Crowcounts(Crowcounts_) {} - - using TeamPol = Kokkos::TeamPolicy; - using TeamMem = typename TeamPol::member_type; - - KOKKOS_INLINE_FUNCTION void longRowFallback(const ordinal_type i) const { - const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); - - // count the union of nonzeros in Arow and Brow - size_type numEntries = 0; - size_type ai = 0; - size_type bi = 0; - size_type Arowstart = Arowptrs(i); - size_type Arowlen = Arowptrs(i + 1) - Arowstart; - size_type Browstart = Browptrs(i); - size_type Browlen = Browptrs(i + 1) - Browstart; - ordinal_type Acol = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart); - ordinal_type Bcol = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart); - while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) { - ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol; - numEntries++; - // Eat all entries in both A and B which have this column - // This also results in Acol/Bcol being updated to following entries for - // next loop iter - while (Acol == Ccol) - Acol = (ai == Arowlen) ? ORDINAL_MAX : Acolinds(Arowstart + ai++); - while (Bcol == Ccol) - Bcol = (bi == Browlen) ? 
ORDINAL_MAX : Bcolinds(Browstart + bi++); - } - Crowcounts(i) = numEntries; - } - - KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { - ordinal_type i = t.league_rank() * t.team_size() + t.team_rank(); - if (i >= nrows) return; - ordinal_type* allScratch = - (ordinal_type*)t.team_shmem().get_shmem(totalShared); - ordinal_type* scratch = allScratch + t.team_rank() * sharedPerThread; - ordinal_type Arowstart = Arowptrs(i); - ordinal_type Arowlen = Arowptrs(i + 1) - Arowstart; - ordinal_type Browstart = Browptrs(i); - ordinal_type Browlen = Browptrs(i + 1) - Browstart; - ordinal_type n = Arowlen + Browlen; - if (n > sharedPerThread) { - // fall back to slow serial method - Kokkos::single(Kokkos::PerThread(t), [&]() { longRowFallback(i); }); - return; - } - if (n == 0) { - Kokkos::single(Kokkos::PerThread(t), [&]() { Crowcounts(i) = 0; }); - return; - } - // Figure out the number of bitonic steps: ceil(log2(n)) - ordinal_type npot = 1; - ordinal_type levels = 0; - while (npot < n) { - levels++; - npot <<= 1; - } - // Copy A and B entries to scratch - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(t, Arowlen), - [&](ordinal_type j) { scratch[j] = Acolinds(Arowstart + j); }); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, Browlen), - [&](ordinal_type j) { - scratch[npot - 1 - j] = Bcolinds(Browstart + j); - }); - // Fill space between A and B with ORDINAL_MAX, - // to maintain a valid bitonic sequence of power-of-two length - Kokkos::parallel_for( - Kokkos::ThreadVectorRange(t, npot - n), [&](ordinal_type j) { - scratch[Arowlen + j] = Kokkos::ArithTraits::max(); - }); - // npot = 2^levels - for (ordinal_type level = 0; level < levels; level++) { - // npot/2 pairs of items are compared in parallel - Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, npot >> 1), - [&](const ordinal_type j) { - ordinal_type boxSize = npot >> level; - // Which box contains this thread? 
- // box = (j / boxSize), and boxSize = - // 2^(levels-level), so box = j * 2^(level-levels) - // = j >> (levels - level) - ordinal_type boxID = (j * 2) >> (levels - level); - // boxStart = boxID * boxSize = boxID * - // 2^(levels-level) = boxID << (levels-level) - ordinal_type boxStart = boxID << (levels - level); - ordinal_type boxOffset = j - boxID * boxSize / 2; - ordinal_type elem1 = boxStart + boxOffset; - ordinal_type elem2 = elem1 + (boxSize >> 1); - if (scratch[elem2] < scratch[elem1]) { - ordinal_type temp = scratch[elem1]; - scratch[elem1] = scratch[elem2]; - scratch[elem2] = temp; - } - }); - } - // Finally, count the number of distinct entries (this is #rising edges + 1) - ordinal_type risingEdges; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(t, n - 1), - [&](const ordinal_type j, ordinal_type& lcount) { - if (scratch[j] != scratch[j + 1]) lcount++; - }, - risingEdges); - Kokkos::single(Kokkos::PerThread(t), - [&]() { Crowcounts(i) = risingEdges + 1; }); - } - - size_t team_shmem_size(int teamSize) const { - return sharedPerThread * sizeof(ordinal_type) * teamSize; - } - - ordinal_type nrows; - const typename ARowPtrsT::const_type Arowptrs; - const AColIndsT Acolinds; - const typename BRowPtrsT::const_type Browptrs; - const BColIndsT Bcolinds; - CRowPtrsT Crowcounts; - int sharedPerThread; // Shared for each thread, measured in - // sizeof(ordinal_type) - int totalShared; // Shared for whole team, measured in bytes -}; - -// get upper bound for C entries per row (assumes worst case, that entries in A -// and B on each row are disjoint) -template -struct UnsortedEntriesUpperBound { - UnsortedEntriesUpperBound(ordinal_type nrows_, - const typename ARowPtrsT::const_type& Arowptrs_, - const typename BRowPtrsT::const_type& Browptrs_, - const CRowPtrsT& Crowcounts_) - : nrows(nrows_), - Arowptrs(Arowptrs_), - Browptrs(Browptrs_), - Crowcounts(Crowcounts_) {} - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - Crowcounts(i) = - 
(Arowptrs(i + 1) - Arowptrs(i)) + (Browptrs(i + 1) - Browptrs(i)); - if (i == nrows - 1) { - // last workitem also zeros the one-past-end entry of row counts, so - // that prefix sum is correct - Crowcounts(nrows) = 0; - } - } - ordinal_type nrows; - const typename ARowPtrsT::const_type Arowptrs; - const typename BRowPtrsT::const_type Browptrs; - CRowPtrsT Crowcounts; -}; - -// Unsorted symbolic: new functors: -// -compute uncompressed C (entries only, no values) -// -sort uncompressed C entries within row, while permuting A union B -// permutation array -compress sorted C entries and A,B perm arrays at the same -// time, which produces Crowcounts value -// Inputs: A, B rowptrs/colinds, C uncompressed rowptrs (and allocated C -// entries) Output: C uncompressed colinds -template -struct UnmergedSumFunctor { - UnmergedSumFunctor(ordinal_type nrows_, const ArowptrsT& Arowptrs_, - const AcolindsT& Acolinds_, const BrowptrsT& Browptrs_, - const BcolindsT& Bcolinds_, const CrowptrsT& Crowptrs_, - const CcolindsT& Ccolinds_, const CcolindsT& ABperm_) - : nrows(nrows_), - Arowptrs(Arowptrs_), - Acolinds(Acolinds_), - Browptrs(Browptrs_), - Bcolinds(Bcolinds_), - Crowptrs(Crowptrs_), - Ccolinds(Ccolinds_), - ABperm(ABperm_) {} - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - size_type inserted = 0; - size_type crowstart = Crowptrs(i); - size_type arowstart = Arowptrs(i); - size_type arowlen = Arowptrs(i + 1) - arowstart; - size_type browstart = Browptrs(i); - size_type browlen = Browptrs(i + 1) - browstart; - // Insert all A entries, then all B entries - for (size_type j = 0; j < arowlen; j++) { - Ccolinds(crowstart + inserted) = Acolinds(arowstart + j); - ABperm(crowstart + inserted) = j; - inserted++; - } - for (size_type j = 0; j < browlen; j++) { - Ccolinds(crowstart + inserted) = Bcolinds(browstart + j); - // tell A and B permutation values apart by adding arowlen as a bias to B - // values - ABperm(crowstart + inserted) = j + arowlen; - 
inserted++; - } - } - ordinal_type nrows; - const ArowptrsT Arowptrs; - const AcolindsT Acolinds; - const BrowptrsT Browptrs; - const BcolindsT Bcolinds; - const CrowptrsT Crowptrs; - CcolindsT Ccolinds; - CcolindsT ABperm; -}; - -template -struct MergeEntriesFunctor { - MergeEntriesFunctor(ordinal_type nrows_, const ArowptrsT& Arowptrs_, - const BrowptrsT& Browptrs_, const CrowptrsT& Crowptrs_, - const CrowptrsT& Crowcounts_, const CcolindsT& Ccolinds_, - const CcolindsT& ABperm_, const CcolindsT& Apos_, - const CcolindsT& Bpos_) - : nrows(nrows_), - Arowptrs(Arowptrs_), - Browptrs(Browptrs_), - Crowptrs(Crowptrs_), - Crowcounts(Crowcounts_), - Ccolinds(Ccolinds_), - ABperm(ABperm_), - Apos(Apos_), - Bpos(Bpos_) {} - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - size_type CrowStart = Crowptrs(i); - size_type CrowEnd = Crowptrs(i + 1); - if (CrowEnd == CrowStart) { - Crowcounts(i) = 0; - return; - } - size_type ArowStart = Arowptrs(i); - size_type ArowNum = Arowptrs(i + 1) - ArowStart; - size_type BrowStart = Browptrs(i); - ordinal_type CFit = 0; // counting through merged C indices (within row) - for (size_type Cit = CrowStart; Cit < CrowEnd; Cit++) { - if ((Cit > CrowStart) && (Ccolinds(Cit) != Ccolinds(Cit - 1))) { - // This is a different column than the previous entry, and is not the - // first entry. This means that this is the first occurence of a unique - // column. - CFit++; - } - size_type permVal = ABperm(Cit); - if (permVal < ArowNum) { - // Entry belongs to A - ordinal_type Aindex = permVal; - // The Aindex'th entry in row i of A will be added into the CFit'th - // entry in C - Apos(ArowStart + Aindex) = CFit; - } else { - // Entry belongs to B - ordinal_type Bindex = permVal - ArowNum; - // The Bindex'th entry in row i of B will be added into the CFit'th - // entry in C - Bpos(BrowStart + Bindex) = CFit; - } - } - // At end of the row, know how many entries are in merged C. 
- // Right now, CFit is the index of the last Apos/Bpos, - // so adding one gives the total number of entries. - Crowcounts(i) = CFit + 1; - } - ordinal_type nrows; - const ArowptrsT Arowptrs; - const BrowptrsT Browptrs; - const CrowptrsT Crowptrs; - CrowptrsT Crowcounts; - CcolindsT Ccolinds; - const CcolindsT ABperm; - CcolindsT Apos; - CcolindsT Bpos; -}; - -// Run SortedCountEntries: non-GPU, always uses the RangePolicy version. -template -void runSortedCountEntries( - const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, - const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, - const clno_row_view_t_& c_rowmap, - typename std::enable_if()>::type* = - nullptr) { - using size_type = typename KernelHandle::size_type; - using ordinal_type = typename KernelHandle::nnz_lno_t; - using execution_space = - typename KernelHandle::SPADDHandleType::execution_space; - using range_type = Kokkos::RangePolicy; - auto nrows = c_rowmap.extent(0) - 1; - SortedCountEntriesRange - countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); - Kokkos::parallel_for( - "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", - range_type(0, nrows), countEntries); -} - -// Run SortedCountEntries: GPU, uses the TeamPolicy or RangePolicy depending -// on average nz per row (a runtime decision) -template -void runSortedCountEntries( - const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, - const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, - const clno_row_view_t_& c_rowmap, - typename std::enable_if()>::type* = - nullptr) { - using size_type = typename KernelHandle::size_type; - using ordinal_type = typename KernelHandle::nnz_lno_t; - using execution_space = - typename KernelHandle::SPADDHandleType::execution_space; - using RangePol = Kokkos::RangePolicy; - using TeamPol = Kokkos::TeamPolicy; - auto nrows = c_rowmap.extent(0) - 1; - size_type c_est_nnz = - 1.4 * (a_entries.extent(0) + b_entries.extent(0)) / 
nrows; - if (c_est_nnz <= 512) { - // Convert c_est_nnz to a power of 2 - size_type pot_est_nnz = 1; - while (pot_est_nnz < c_est_nnz) pot_est_nnz *= 2; - // Estimate max number of uncompressed entries in each row of C - int vector_length = 1; - int vector_length_max = - KokkosKernels::Impl::kk_get_max_vector_size(); - while (vector_length * 2 <= vector_length_max && - (size_type)vector_length * 2 <= pot_est_nnz) { - vector_length *= 2; - } - SortedCountEntriesTeam - countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); - countEntries.sharedPerThread = pot_est_nnz; - // compute largest possible team size - TeamPol testPolicy(1, 1, vector_length); - testPolicy.set_scratch_size( - 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); - int team_size = testPolicy.team_size_recommended(countEntries, - Kokkos::ParallelForTag()); - // construct real policy - int league_size = (nrows + team_size - 1) / team_size; - TeamPol policy(league_size, team_size, vector_length); - policy.set_scratch_size( - 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); - countEntries.totalShared = - countEntries.sharedPerThread * team_size * sizeof(ordinal_type); - Kokkos::parallel_for( - "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", policy, - countEntries); - } else { - SortedCountEntriesRange - countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); - Kokkos::parallel_for( - "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", - RangePol(0, nrows), countEntries); - } -} - // Symbolic: count entries in each row in C to produce rowmap // kernel handle has information about whether it is sorted add or not. 
template ::value, - "add_symbolic: C size_type must not be const"); - static_assert( - SAME_TYPE(typename alno_nnz_view_t_::non_const_value_type, ordinal_type), - "add_symbolic: A entry type must match KernelHandle entry type (aka " - "nnz_lno_t, and const doesn't matter)"); - static_assert( - SAME_TYPE(typename blno_nnz_view_t_::non_const_value_type, ordinal_type), - "add_symbolic: B entry type must match KernelHandle entry type (aka " - "nnz_lno_t, and const doesn't matter)"); - static_assert( - SAME_TYPE(typename clno_nnz_view_t_::non_const_value_type, ordinal_type), - "add_symbolic: C entry type must match KernelHandle entry type (aka " - "nnz_lno_t)"); - static_assert(std::is_same::value, - "add_symbolic: C entry type must not be const"); - // symbolic just needs to compute c_rowmap - // easy for sorted, but for unsorted is easiest to just compute the whole sum - auto addHandle = handle->get_spadd_handle(); - if (a_rowmap.extent(0) == 0 || a_rowmap.extent(0) == 1) { - // Have 0 rows, so nothing to do except set #nnz to 0 - addHandle->set_c_nnz(0); - // If c_rowmap has a single entry, it must be 0 - if (c_rowmap.extent(0)) Kokkos::deep_copy(c_rowmap, (size_type)0); - addHandle->set_call_symbolic(); - return; - } - ordinal_type nrows = a_rowmap.extent(0) - 1; - typedef Kokkos::RangePolicy range_type; - if (addHandle->is_input_sorted()) { - runSortedCountEntries( - a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nrows + 1, c_rowmap); - } else { - // note: scoping individual parts of the process to free views sooner, - // minimizing peak memory usage run the unsorted c_rowmap upper bound - // functor (just adds together A and B entry counts row by row) - clno_row_view_t_ c_rowmap_upperbound( - Kokkos::view_alloc(Kokkos::WithoutInitializing, - "C row counts upper bound"), - nrows + 1); - size_type c_nnz_upperbound = 0; - { - UnsortedEntriesUpperBound - countEntries(nrows, a_rowmap, b_rowmap, 
c_rowmap_upperbound); - Kokkos::parallel_for( - "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries", - range_type(0, nrows), countEntries); - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nrows + 1, c_rowmap_upperbound); - Kokkos::deep_copy(c_nnz_upperbound, - Kokkos::subview(c_rowmap_upperbound, nrows)); - } - clno_nnz_view_t_ c_entries_uncompressed( - Kokkos::view_alloc(Kokkos::WithoutInitializing, - "C entries uncompressed"), - c_nnz_upperbound); - clno_nnz_view_t_ ab_perm( - Kokkos::view_alloc(Kokkos::WithoutInitializing, - "A and B permuted entry indices"), - c_nnz_upperbound); - // compute the unmerged sum - UnmergedSumFunctor - unmergedSum(nrows, a_rowmap, a_entries, b_rowmap, b_entries, - c_rowmap_upperbound, c_entries_uncompressed, ab_perm); - Kokkos::parallel_for( - "KokkosSparse::SpAdd:Symbolic::InputNotSorted::UnmergedSum", - range_type(0, nrows), unmergedSum); - // sort the unmerged sum - KokkosKernels::sort_crs_matrix( - c_rowmap_upperbound, c_entries_uncompressed, ab_perm); - clno_nnz_view_t_ a_pos( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "A entry positions"), - a_entries.extent(0)); - clno_nnz_view_t_ b_pos( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "B entry positions"), - b_entries.extent(0)); - // merge the entries and compute Apos/Bpos, as well as Crowcounts - { - MergeEntriesFunctor - mergeEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound, c_rowmap, - c_entries_uncompressed, ab_perm, a_pos, b_pos); - Kokkos::parallel_for( - "KokkosSparse::SpAdd:Symbolic::InputNotSorted::MergeEntries", - range_type(0, nrows), mergeEntries); - // compute actual c_rowmap - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nrows + 1, c_rowmap); - } - addHandle->set_a_b_pos(a_pos, b_pos); - } - // provide the number of NNZ in C to user through handle - size_type cmax; - Kokkos::deep_copy(cmax, Kokkos::subview(c_rowmap, nrows)); - addHandle->set_c_nnz(cmax); - addHandle->set_call_symbolic(); - 
addHandle->set_call_numeric(false); - // this fence is for accurate timing from host - execution_space().fence(); + typedef typename KernelHandle::HandleExecSpace ExecSpace; + typedef typename KernelHandle::HandleTempMemorySpace MemSpace; + typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; + typedef typename Kokkos::Device DeviceType; + + typedef typename KernelHandle::const_size_type c_size_t; + typedef typename KernelHandle::const_nnz_lno_t c_lno_t; + typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t; + + typedef typename KokkosKernels::Experimental::KokkosKernelsHandle< + c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace> + ConstKernelHandle; + ConstKernelHandle tmp_handle(*handle); + + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_a_rowmap; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_a_entries; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_b_rowmap; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_b_entries; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_c_rowmap; + KokkosSparse::Impl::SPADD_SYMBOLIC:: + spadd_symbolic(&tmp_handle, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0))); } -template -struct SortedNumericSumFunctor { - using CscalarT = typename CvaluesT::non_const_value_type; - - SortedNumericSumFunctor(const ArowptrsT& Arowptrs_, - const BrowptrsT& Browptrs_, - const CrowptrsT& Crowptrs_, - const AcolindsT& Acolinds_, - const BcolindsT& Bcolinds_, - const CcolindsT& Ccolinds_, const AvaluesT& Avalues_, - const BvaluesT& Bvalues_, const CvaluesT& Cvalues_, - 
const AscalarT alpha_, const BscalarT beta_) - : Arowptrs(Arowptrs_), - Browptrs(Browptrs_), - Crowptrs(Crowptrs_), - Acolinds(Acolinds_), - Bcolinds(Bcolinds_), - Ccolinds(Ccolinds_), - Avalues(Avalues_), - Bvalues(Bvalues_), - Cvalues(Cvalues_), - alpha(alpha_), - beta(beta_) {} - - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); - - // count the union of nonzeros in Arow and Brow - size_type ai = 0; - size_type bi = 0; - size_type Arowstart = Arowptrs(i); - size_type Arowlen = Arowptrs(i + 1) - Arowstart; - size_type Browstart = Browptrs(i); - size_type Browlen = Browptrs(i + 1) - Browstart; - ordinal_type Acol = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart); - ordinal_type Bcol = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart); - size_type Coffset = Crowptrs(i); - while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) { - ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol; - // Eat all entries in both A and B which have this column - // This also results in Acol/Bcol being updated to following entries for - // next loop iter - CscalarT accum = Kokkos::ArithTraits::zero(); - while (Acol == Ccol) { - accum += static_cast(alpha * Avalues(Arowstart + ai)); - ai++; - if (ai == Arowlen) - Acol = ORDINAL_MAX; - else - Acol = Acolinds(Arowstart + ai); - } - while (Bcol == Ccol) { - accum += static_cast(beta * Bvalues(Browstart + bi)); - bi++; - if (bi == Browlen) - Bcol = ORDINAL_MAX; - else - Bcol = Bcolinds(Browstart + bi); - } - Ccolinds(Coffset) = Ccol; - Cvalues(Coffset) = accum; - Coffset++; - } - } - - const ArowptrsT Arowptrs; - const BrowptrsT Browptrs; - const CrowptrsT Crowptrs; - const AcolindsT Acolinds; - const BcolindsT Bcolinds; - CcolindsT Ccolinds; - const AvaluesT Avalues; - const BvaluesT Bvalues; - CvaluesT Cvalues; - const AscalarT alpha; - const BscalarT beta; -}; - -template -struct UnsortedNumericSumFunctor { - using CscalarT = typename 
CvaluesT::non_const_value_type; - - UnsortedNumericSumFunctor( - const ArowptrsT Arowptrs_, const BrowptrsT Browptrs_, - const CrowptrsT Crowptrs_, const AcolindsT Acolinds_, - const BcolindsT Bcolinds_, CcolindsT Ccolinds_, const AvaluesT Avalues_, - const BvaluesT Bvalues_, CvaluesT Cvalues_, const AscalarT alpha_, - const BscalarT beta_, const CcolindsT Apos_, const CcolindsT Bpos_) - : Arowptrs(Arowptrs_), - Browptrs(Browptrs_), - Crowptrs(Crowptrs_), - Acolinds(Acolinds_), - Bcolinds(Bcolinds_), - Ccolinds(Ccolinds_), - Avalues(Avalues_), - Bvalues(Bvalues_), - Cvalues(Cvalues_), - alpha(alpha_), - beta(beta_), - Apos(Apos_), - Bpos(Bpos_) {} - - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { - size_type CrowStart = Crowptrs(i); - size_type CrowEnd = Crowptrs(i + 1); - size_type ArowStart = Arowptrs(i); - size_type ArowEnd = Arowptrs(i + 1); - size_type BrowStart = Browptrs(i); - size_type BrowEnd = Browptrs(i + 1); - for (size_type j = CrowStart; j < CrowEnd; j++) - Cvalues(j) = Kokkos::ArithTraits::zero(); - // add in A entries, while setting C colinds - for (size_type j = ArowStart; j < ArowEnd; j++) { - Cvalues(CrowStart + Apos(j)) += alpha * Avalues(j); - Ccolinds(CrowStart + Apos(j)) = Acolinds(j); - } - // add in B entries, while setting C colinds - for (size_type j = BrowStart; j < BrowEnd; j++) { - Cvalues(CrowStart + Bpos(j)) += beta * Bvalues(j); - Ccolinds(CrowStart + Bpos(j)) = Bcolinds(j); - } - } - const ArowptrsT Arowptrs; - const BrowptrsT Browptrs; - const CrowptrsT Crowptrs; - const AcolindsT Acolinds; - const BcolindsT Bcolinds; - CcolindsT Ccolinds; - const AvaluesT Avalues; - const BvaluesT Bvalues; - CvaluesT Cvalues; - const AscalarT alpha; - const BscalarT beta; - const CcolindsT Apos; - const CcolindsT Bpos; -}; - template -void spadd_numeric(KernelHandle* kernel_handle, const alno_row_view_t_ a_rowmap, +void spadd_numeric(KernelHandle* handle, const alno_row_view_t_ a_rowmap, const alno_nnz_view_t_ a_entries, 
const ascalar_nnz_view_t_ a_values, const ascalar_t_ alpha, const blno_row_view_t_ b_rowmap, @@ -802,89 +130,81 @@ void spadd_numeric(KernelHandle* kernel_handle, const alno_row_view_t_ a_rowmap, const bscalar_nnz_view_t_ b_values, const bscalar_t_ beta, const clno_row_view_t_ c_rowmap, clno_nnz_view_t_ c_entries, cscalar_nnz_view_t_ c_values) { - typedef typename KernelHandle::size_type size_type; - typedef typename KernelHandle::nnz_lno_t ordinal_type; - typedef typename KernelHandle::nnz_scalar_t scalar_type; - typedef - typename KernelHandle::SPADDHandleType::execution_space execution_space; - // Check that A/B/C data types match KernelHandle types, and that C data types - // are nonconst (doesn't matter if A/B types are const) - static_assert(SAME_TYPE(ascalar_t_, scalar_type), - "A scalar type must match handle scalar type"); - static_assert(SAME_TYPE(bscalar_t_, scalar_type), - "B scalar type must match handle scalar type"); - static_assert(SAME_TYPE(typename alno_row_view_t_::value_type, size_type), - "add_symbolic: A size_type must match KernelHandle size_type " - "(const doesn't matter)"); - static_assert(SAME_TYPE(typename blno_row_view_t_::value_type, size_type), - "add_symbolic: B size_type must match KernelHandle size_type " - "(const doesn't matter)"); - static_assert( - SAME_TYPE(typename clno_row_view_t_::non_const_value_type, size_type), - "add_symbolic: C size_type must match KernelHandle size_type)"); - static_assert(SAME_TYPE(typename alno_nnz_view_t_::value_type, ordinal_type), - "add_symbolic: A entry type must match KernelHandle entry type " - "(aka nnz_lno_t, and const doesn't matter)"); - static_assert(SAME_TYPE(typename blno_nnz_view_t_::value_type, ordinal_type), - "add_symbolic: B entry type must match KernelHandle entry type " - "(aka nnz_lno_t, and const doesn't matter)"); - static_assert(SAME_TYPE(typename clno_nnz_view_t_::value_type, ordinal_type), - "add_symbolic: C entry type must match KernelHandle entry type " - "(aka 
nnz_lno_t)"); - static_assert(std::is_same::value, - "add_symbolic: C entry type must not be const"); - static_assert( - SAME_TYPE(typename ascalar_nnz_view_t_::value_type, scalar_type), - "add_symbolic: A scalar type must match KernelHandle entry type (aka " - "nnz_lno_t, and const doesn't matter)"); - static_assert( - SAME_TYPE(typename bscalar_nnz_view_t_::value_type, scalar_type), - "add_symbolic: B scalar type must match KernelHandle entry type (aka " - "nnz_lno_t, and const doesn't matter)"); - static_assert( - SAME_TYPE(typename cscalar_nnz_view_t_::value_type, scalar_type), - "add_symbolic: C scalar type must match KernelHandle entry type (aka " - "nnz_lno_t)"); - static_assert(std::is_same::value, - "add_symbolic: C scalar type must not be const"); - typedef Kokkos::RangePolicy range_type; - auto addHandle = kernel_handle->get_spadd_handle(); - // rowmap length can be 0 or 1 if #rows is 0. - // Otherwise, it's always #rows+1. - if (a_rowmap.extent(0) == 0 || a_rowmap.extent(0) == 1) { - addHandle->set_call_numeric(); - return; - } - ordinal_type nrows = a_rowmap.extent(0) - 1; - if (addHandle->is_input_sorted()) { - SortedNumericSumFunctor< - size_type, ordinal_type, alno_row_view_t_, blno_row_view_t_, - clno_row_view_t_, alno_nnz_view_t_, blno_nnz_view_t_, clno_nnz_view_t_, - ascalar_nnz_view_t_, bscalar_nnz_view_t_, cscalar_nnz_view_t_, - ascalar_t_, bscalar_t_> - sortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries, - c_entries, a_values, b_values, c_values, alpha, beta); - Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputSorted", - range_type(0, nrows), sortedNumeric); - } else { - // use a_pos and b_pos (set in the handle by symbolic) to quickly compute C - // entries and values - UnsortedNumericSumFunctor< - size_type, ordinal_type, alno_row_view_t_, blno_row_view_t_, - clno_row_view_t_, alno_nnz_view_t_, blno_nnz_view_t_, clno_nnz_view_t_, - ascalar_nnz_view_t_, bscalar_nnz_view_t_, cscalar_nnz_view_t_, - ascalar_t_, bscalar_t_> - 
unsortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries, - c_entries, a_values, b_values, c_values, alpha, beta, - addHandle->get_a_pos(), addHandle->get_b_pos()); - Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputNotSorted", - range_type(0, nrows), unsortedNumeric); - } - addHandle->set_call_numeric(); - // this fence is for accurate timing from host - execution_space().fence(); + typedef typename KernelHandle::HandleExecSpace ExecSpace; + typedef typename KernelHandle::HandleTempMemorySpace MemSpace; + typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; + typedef typename Kokkos::Device DeviceType; + + typedef typename KernelHandle::const_size_type c_size_t; + typedef typename KernelHandle::const_nnz_lno_t c_lno_t; + typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t; + + typedef typename KokkosKernels::Experimental::KokkosKernelsHandle< + c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace> + ConstKernelHandle; + ConstKernelHandle tmp_handle(*handle); + + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_a_rowmap; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_a_entries; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_a_values; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_b_rowmap; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_b_entries; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_b_values; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_c_rowmap; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_c_entries; + typedef Kokkos::View::array_layout, + DeviceType, Kokkos::MemoryTraits > + Internal_c_values; + KokkosSparse::Impl::SPADD_NUMERIC:: + spadd_numeric(&tmp_handle, alpha, + 
Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_a_values(a_values.data(), a_values.extent(0)), + beta, + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_b_values(b_values.data(), b_values.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)), + Internal_c_entries(c_entries.data(), c_entries.extent(0)), + Internal_c_values(c_values.data(), c_values.extent(0))); } } // namespace Experimental diff --git a/src/sparse/KokkosSparse_spgemm.hpp b/src/sparse/KokkosSparse_spgemm.hpp index bdf4d0da75..0cee2979a2 100644 --- a/src/sparse/KokkosSparse_spgemm.hpp +++ b/src/sparse/KokkosSparse_spgemm.hpp @@ -81,6 +81,47 @@ void spgemm_symbolic(KernelHandle& kh, const AMatrix& A, const bool Amode, entriesC); } +// Symbolic phase for block SpGEMM (BSR matrices) +template +void block_spgemm_symbolic(KernelHandle& kh, const AMatrixType& A, + const bool transposeA, const BMatrixType& B, + const bool transposeB, CMatrixType& C) { + using row_map_type = typename CMatrixType::row_map_type::non_const_type; + using entries_type = typename CMatrixType::index_type::non_const_type; + using values_type = typename CMatrixType::values_type::non_const_type; + + auto blockDim = A.blockDim(); + if (blockDim != B.blockDim()) { + throw std::invalid_argument( + "Block SpGEMM must be called for matrices with the same block size"); + } + + row_map_type row_mapC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "non_const_lnow_row"), + A.numRows() + 1); + + KokkosSparse::Experimental::spgemm_symbolic( + &kh, A.numRows(), B.numRows(), B.numCols(), A.graph.row_map, + A.graph.entries, transposeA, B.graph.row_map, B.graph.entries, transposeB, + row_mapC); + + entries_type entriesC; + values_type valuesC; + const size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz(); + if (c_nnz_size) { + entriesC = entries_type( + 
Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), + c_nnz_size); + valuesC = + values_type(Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), + c_nnz_size * blockDim * blockDim); + } + + C = CMatrixType("C=AB", A.numRows(), B.numCols(), c_nnz_size, valuesC, + row_mapC, entriesC, blockDim); +} + template void spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, const BMatrix& B, const bool Bmode, CMatrix& C) { @@ -94,6 +135,21 @@ void spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, B.values, Bmode, C.graph.row_map, C.graph.entries, C.values); } +template +void block_spgemm_numeric(KernelHandle& kh, const AMatrix& A, const bool Amode, + const BMatrix& B, const bool Bmode, CMatrix& C) { + auto blockDim = A.blockDim(); + if (blockDim != B.blockDim() or blockDim != C.blockDim()) { + throw std::invalid_argument( + "Block SpGEMM must be called for matrices with the same block size"); + } + + KokkosSparse::Experimental::spgemm_numeric( + &kh, A.numRows(), B.numRows(), B.numCols(), A.graph.row_map, + A.graph.entries, A.values, Amode, B.graph.row_map, B.graph.entries, + B.values, Bmode, C.graph.row_map, C.graph.entries, C.values, blockDim); +} + } // namespace KokkosSparse #endif diff --git a/src/sparse/KokkosSparse_spgemm_numeric.hpp b/src/sparse/KokkosSparse_spgemm_numeric.hpp index 60a54f5b8b..313922dc62 100644 --- a/src/sparse/KokkosSparse_spgemm_numeric.hpp +++ b/src/sparse/KokkosSparse_spgemm_numeric.hpp @@ -46,11 +46,18 @@ #include "KokkosKernels_helpers.hpp" #include "KokkosSparse_spgemm_numeric_spec.hpp" +#include "KokkosSparse_bspgemm_numeric_spec.hpp" namespace KokkosSparse { namespace Experimental { +// +// NOTE: block_dim = 1 for CRS-formated views +// block_dim >= 1 for BSR-formatted views (bs=1 BSR is CRS) +// +// NOTE: Block CRS format is not yet supported ! 
+// template ::value, @@ -139,7 +148,9 @@ void spgemm_numeric(KernelHandle *handle, "If you need this case please let kokkos-kernels developers know.\n"); } - if (m < 1 || n < 1 || k < 1) return; + if (m < 1 || n < 1 || k < 1 || entriesA.extent(0) < 1 || + entriesB.extent(0) < 1) + return; typedef typename KernelHandle::const_size_type c_size_t; typedef typename KernelHandle::const_nnz_lno_t c_lno_t; @@ -240,6 +251,23 @@ void spgemm_numeric(KernelHandle *handle, Internal_clno_nnz_view_t_ nonconst_c_l(entriesC.data(), entriesC.extent(0)); Internal_cscalar_nnz_view_t_ nonconst_c_s(valuesC.data(), valuesC.extent(0)); + if (block_dim > 1) { + KokkosSparse::Impl::BSPGEMM_NUMERIC< + const_handle_type, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, + Internal_ascalar_nnz_view_t_, Internal_blno_row_view_t_, + Internal_blno_nnz_view_t_, Internal_bscalar_nnz_view_t_, + Internal_clno_row_view_t_, Internal_clno_nnz_view_t_, + Internal_cscalar_nnz_view_t_>::bspgemm_numeric(&tmp_handle, m, n, k, + block_dim, const_a_r, + const_a_l, const_a_s, + transposeA, const_b_r, + const_b_l, const_b_s, + transposeB, nonconst_c_r, + nonconst_c_l, + nonconst_c_s); + return; + } + KokkosSparse::Impl::SPGEMM_NUMERIC< const_handle_type, // KernelHandle, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, diff --git a/src/sparse/KokkosSparse_spiluk_handle.hpp b/src/sparse/KokkosSparse_spiluk_handle.hpp index 3cabcd0f73..54cc124474 100644 --- a/src/sparse/KokkosSparse_spiluk_handle.hpp +++ b/src/sparse/KokkosSparse_spiluk_handle.hpp @@ -45,6 +45,7 @@ #include #include #include +#include #ifndef _SPILUKHANDLE_HPP #define _SPILUKHANDLE_HPP @@ -87,6 +88,12 @@ class SPILUKHandle { typedef typename Kokkos::View nnz_lno_view_t; + typedef typename Kokkos::View + nnz_row_view_host_t; + + typedef typename Kokkos::View + nnz_lno_view_host_t; + typedef typename std::make_signed< typename nnz_row_view_t::non_const_value_type>::type signed_integral_t; typedef Kokkos::View signed_nnz_lno_view_t; + 
typedef Kokkos::View + work_view_t; + private: nnz_row_view_t level_list; // level IDs which the rows belong to nnz_lno_view_t level_idx; // the list of rows in each level nnz_lno_view_t level_ptr; // the starting index (into the view level_idx) of each level - nnz_lno_view_t level_nchunks; // number of chunks of rows at each level - nnz_lno_view_t + nnz_lno_view_host_t level_nchunks; // number of chunks of rows at each level + nnz_lno_view_host_t level_nrowsperchunk; // maximum number of rows among chunks at each level + work_view_t iw; // working view for mapping dense indices to sparse indices size_type nrows; size_type nlevels; @@ -128,6 +140,7 @@ class SPILUKHandle { level_ptr(), level_nchunks(), level_nrowsperchunk(), + iw(), nrows(nrows_), nlevels(0), nnzL(nnzL_), @@ -147,11 +160,12 @@ class SPILUKHandle { set_nnzU(nnzU_); set_level_maxrows(0); set_level_maxrowsperchunk(0); - level_list = nnz_row_view_t("level_list", nrows_), - level_idx = nnz_lno_view_t("level_idx", nrows_), - level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), - level_nchunks = nnz_lno_view_t(), level_nrowsperchunk = nnz_lno_view_t(), - reset_symbolic_complete(); + level_list = nnz_row_view_t("level_list", nrows_), + level_idx = nnz_lno_view_t("level_idx", nrows_), + level_ptr = nnz_lno_view_t("level_ptr", nrows_ + 1), + level_nchunks = nnz_lno_view_host_t(), + level_nrowsperchunk = nnz_lno_view_host_t(), reset_symbolic_complete(), + iw = work_view_t(); } virtual ~SPILUKHandle(){}; @@ -170,17 +184,28 @@ class SPILUKHandle { nnz_lno_view_t get_level_ptr() const { return level_ptr; } KOKKOS_INLINE_FUNCTION - nnz_lno_view_t get_level_nchunks() const { return level_nchunks; } + nnz_lno_view_host_t get_level_nchunks() const { return level_nchunks; } void alloc_level_nchunks(const size_type nlevels_) { - level_nchunks = nnz_lno_view_t("level_nchunks", nlevels_); + level_nchunks = nnz_lno_view_host_t("level_nchunks", nlevels_); } KOKKOS_INLINE_FUNCTION - nnz_lno_view_t get_level_nrowsperchunk() 
const { return level_nrowsperchunk; } + nnz_lno_view_host_t get_level_nrowsperchunk() const { + return level_nrowsperchunk; + } void alloc_level_nrowsperchunk(const size_type nlevels_) { - level_nrowsperchunk = nnz_lno_view_t("level_nrowsperchunk", nlevels_); + level_nrowsperchunk = nnz_lno_view_host_t("level_nrowsperchunk", nlevels_); + } + + KOKKOS_INLINE_FUNCTION + work_view_t get_iw() const { return iw; } + + void alloc_iw(const size_type nrows_, const size_type ncols_) { + iw = work_view_t(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), + nrows_, ncols_); + Kokkos::deep_copy(iw, nnz_lno_t(-1)); } KOKKOS_INLINE_FUNCTION @@ -238,8 +263,7 @@ class SPILUKHandle { if (algm == SPILUKAlgorithm::SEQLVLSCHD_TP1) std::cout << "SEQLVLSCHD_TP1" << std::endl; - /* - if ( algm == SPILUKAlgorithm::SEQLVLSCHED_TP2 ) { + /*if ( algm == SPILUKAlgorithm::SEQLVLSCHED_TP2 ) { std::cout << "SEQLVLSCHED_TP2" << std::endl;; std::cout << "WARNING: With CUDA this is currently only reliable with int-int ordinal-offset pair" << std::endl; diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index 8ec7799e16..95860029f1 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -662,9 +662,10 @@ template ::value>::type* = nullptr> #endif -void spmv(KokkosKernels::Experimental::Controls /*controls*/, const char mode[], +void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y, const RANK_TWO) { + // Make sure that x and y have the same rank. 
static_assert( static_cast(XVector::rank) == static_cast(YVector::rank), @@ -752,21 +753,50 @@ void spmv(KokkosKernels::Experimental::Controls /*controls*/, const char mode[], XVector_Internal x_i = x; YVector_Internal y_i = y; - return Impl::SPMV_MV< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type**, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type**, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>::spmv_mv(mode, alpha, A_i, - x_i, beta, y_i); + bool useNative = false; + +// cusparseSpMM does not support conjugate mode +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + useNative = useNative || (Conjugate[0] == mode[0]); +#endif + useNative = useNative || (controls.isParameter("algorithm") && + (controls.getParameter("algorithm") == "native")); + + if (useNative) { + return Impl::SPMV_MV< + typename AMatrix_Internal::value_type, + typename AMatrix_Internal::ordinal_type, + typename AMatrix_Internal::device_type, + typename AMatrix_Internal::memory_traits, + typename AMatrix_Internal::size_type, + typename XVector_Internal::value_type**, + typename XVector_Internal::array_layout, + typename XVector_Internal::device_type, + typename XVector_Internal::memory_traits, + typename YVector_Internal::value_type**, + typename YVector_Internal::array_layout, + typename YVector_Internal::device_type, + typename YVector_Internal::memory_traits, + std::is_integral::value, + false>::spmv_mv(controls, mode, alpha, A_i, x_i, beta, y_i); + } else { + return Impl::SPMV_MV< + typename AMatrix_Internal::value_type, + typename AMatrix_Internal::ordinal_type, + typename 
AMatrix_Internal::device_type, + typename AMatrix_Internal::memory_traits, + typename AMatrix_Internal::size_type, + typename XVector_Internal::value_type**, + typename XVector_Internal::array_layout, + typename XVector_Internal::device_type, + typename XVector_Internal::memory_traits, + typename YVector_Internal::value_type**, + typename YVector_Internal::array_layout, + typename YVector_Internal::device_type, + typename YVector_Internal::memory_traits>::spmv_mv(controls, mode, + alpha, A_i, x_i, + beta, y_i); + } } } @@ -894,8 +924,10 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // // Whether to call KokkosKernel's native implementation, even if a TPL impl is // available - bool useFallback = controls.isParameter("algorithm") && - controls.getParameter("algorithm") == "native"; + bool useFallback = + controls.isParameter("algorithm") && + (controls.getParameter("algorithm") == "native" || + controls.getParameter("algorithm") == "experimental_bsr_tc"); #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE // cuSPARSE does not support the modes (C), (T), (H) @@ -936,6 +968,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename YVector_Internal::array_layout, typename YVector_Internal::device_type, typename YVector_Internal::memory_traits, + std::is_integral::value, false>::spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); Kokkos::Profiling::popRegion(); } else { @@ -952,11 +985,9 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename YVector_Internal::value_type**, typename YVector_Internal::array_layout, typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>::spmv_mv_bsrmatrix(controls, - mode, - alpha, A_i, - x_i, beta, - y_i); + typename YVector_Internal::memory_traits, + std::is_integral::value>:: + spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); } } @@ -1072,12 +1103,12 @@ void 
spmv(KokkosKernels::Experimental::Controls controls, const char mode[], } // return Experimental::Impl::SPMV_MV_BLOCKCRSMATRIX< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, + typename AMatrix_Internal::const_value_type, + typename AMatrix_Internal::const_ordinal_type, typename AMatrix_Internal::device_type, typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type**, + typename AMatrix_Internal::const_size_type, + typename XVector_Internal::const_value_type**, typename XVector_Internal::array_layout, typename XVector_Internal::device_type, typename XVector_Internal::memory_traits, @@ -1097,7 +1128,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], /// entries of y; if alpha == 0, ignore the entries of A and x. /// /// If \c AMatrix is a KokkosSparse::Experimental::BsrMatrix, controls may have -/// \c "algorithm" = \c "experimental_tc_bsr" to use Nvidia tensor cores on +/// \c "algorithm" = \c "experimental_bsr_tc" to use Nvidia tensor cores on /// Volta or Ampere architectures. On Volta-architecture GPUs the only available /// precision is mixed-precision fp32 accumulator from fp16 inputs. 
On /// Ampere-architecture GPUs (cc >= 80), mixed precision is used when A is fp16, @@ -1530,8 +1561,9 @@ void spmv_struct(const char mode[], const int stencil_type, typename YVector_Internal::value_type**, typename YVector_Internal::array_layout, typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>::spmv_mv(mode, alpha, A_i, - x_i, beta, y_i); + typename YVector_Internal::memory_traits>:: + spmv_mv(KokkosKernels::Experimental::Controls(), mode, alpha, A_i, x_i, + beta, y_i); } } diff --git a/src/sparse/KokkosSparse_sptrsv_cholmod.hpp b/src/sparse/KokkosSparse_sptrsv_cholmod.hpp index 796ee579bd..6d354047cf 100644 --- a/src/sparse/KokkosSparse_sptrsv_cholmod.hpp +++ b/src/sparse/KokkosSparse_sptrsv_cholmod.hpp @@ -56,7 +56,7 @@ defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) #include "cholmod.h" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosSparse_sptrsv_supernode.hpp" namespace KokkosSparse { diff --git a/src/sparse/KokkosSparse_sptrsv_supernode.hpp b/src/sparse/KokkosSparse_sptrsv_supernode.hpp index fa9a607be7..481bd2cc0a 100644 --- a/src/sparse/KokkosSparse_sptrsv_supernode.hpp +++ b/src/sparse/KokkosSparse_sptrsv_supernode.hpp @@ -63,7 +63,7 @@ #include "KokkosBatched_Trmm_Decl.hpp" #include "KokkosBatched_Trmm_Serial_Impl.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_SortCrs.hpp" #include "KokkosSparse_sptrsv.hpp" namespace KokkosSparse { @@ -597,8 +597,8 @@ host_graph_t generate_supernodal_graph(bool col_major, graph_t &graph, #endif // sort column ids per row - KokkosKernels::sort_crs_graph(hr, hc); + KokkosSparse::sort_crs_graph(hr, hc); #ifdef KOKKOS_SPTRSV_SUPERNODE_PROFILE time_seconds = timer.seconds(); std::cout << " > Generate Supernodal Graph: sort graph : " diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp new file mode 100644 index 0000000000..7b003229ab --- /dev/null +++ 
b/src/sparse/impl/KokkosSparse_bspgemm_impl.hpp @@ -0,0 +1,198 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSBSPGEMMIMPL_HPP +#define _KOKKOSBSPGEMMIMPL_HPP + +#include "KokkosSparse_spgemm_impl.hpp" + +namespace KokkosSparse { + +namespace Impl { + +template +class KokkosBSPGEMM + : public KokkosSPGEMM { // extends the KokkosSPGEMM implementation with a block dimension (block_dim) + public: + using Base = KokkosSparse::Impl::KokkosSPGEMM< + HandleType, a_row_view_t_, a_lno_nnz_view_t_, a_scalar_nnz_view_t_, + b_lno_row_view_t_, b_lno_nnz_view_t_, b_scalar_nnz_view_t_>; + +#define USE_BASE_TYPE(type) using type = typename Base::type; + + USE_BASE_TYPE(nnz_lno_t) + USE_BASE_TYPE(scalar_t) + USE_BASE_TYPE(size_type) + USE_BASE_TYPE(const_a_lno_row_view_t) + USE_BASE_TYPE(const_a_lno_nnz_view_t) + USE_BASE_TYPE(const_a_scalar_nnz_view_t) + USE_BASE_TYPE(const_b_lno_row_view_t) + USE_BASE_TYPE(const_b_lno_nnz_view_t) + USE_BASE_TYPE(const_b_scalar_nnz_view_t) + USE_BASE_TYPE(row_lno_persistent_work_view_t) + USE_BASE_TYPE(nnz_lno_temp_work_view_t) + USE_BASE_TYPE(team_member_t) + + USE_BASE_TYPE(MyExecSpace) + USE_BASE_TYPE(MyTempMemorySpace) + USE_BASE_TYPE(MultiCoreTag) + USE_BASE_TYPE(MultiCoreTag4) + USE_BASE_TYPE(GPUTag) + USE_BASE_TYPE(GPUTag4) + USE_BASE_TYPE(GPUTag6) + USE_BASE_TYPE(gpu_team_policy_t) + USE_BASE_TYPE(gpu_team_policy4_t) + USE_BASE_TYPE(gpu_team_policy6_t) + USE_BASE_TYPE(dynamic_multicore_team_policy_t) + USE_BASE_TYPE(dynamic_multicore_team_policy4_t) + USE_BASE_TYPE(multicore_team_policy_t) + USE_BASE_TYPE(multicore_team_policy4_t) + + public: + ////////////////////////////////////////////////////////////////////////// + /////BELOW CODE IS for SPEED SPGEMM + ////Declared here; see KokkosSparse_bspgemm_impl_speed.hpp (included at the end of this header) + ////////////////////////////////////////////////////////////////////////// + template + struct NumericCMEM_CPU; + + template + struct NumericCMEM; + + private: + /** + * \brief Numeric phase with speed method + */ + template + void KokkosBSPGEMM_numeric_speed( + c_row_view_t 
rowmapC_, c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_, + KokkosKernels::Impl::ExecSpaceType my_exec_space); + + private: + // How many extra bytes are needed to align a scalar_t after an array of + // nnz_lno_t, in the worst case? Incurred once per hashmap, which may be per + // team or per thread depending on algorithm. + static constexpr size_t scalarAlignPad = + (alignof(scalar_t) > alignof(nnz_lno_t)) + ? (alignof(scalar_t) - alignof(nnz_lno_t)) + : 0; + + static constexpr bool exec_gpu = + KokkosKernels::Impl::kk_is_gpu_exec_space(); + + private: + nnz_lno_t block_dim; // set from the block_dim_ constructor argument + + public: + ////////////////////////////////////////////////////////////////////////// + /////BELOW CODE IS for kkmem SPGEMM + ////Declared here; PortableNumericCHASH is defined in KokkosSparse_bspgemm_impl_kkmem.hpp + ////////////////////////////////////////////////////////////////////////// + template + struct PortableNumericCHASH; + + template + void KokkosBSPGEMM_numeric_hash( + c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_, + KokkosKernels::Impl::ExecSpaceType my_exec_space); + + public: + ////////////////////////////////////////////////////////////////////////// + /////BELOW CODE IS for the public numeric function and the constructors + ////KokkosBSPGEMM_numeric is defined in KokkosSparse_bspgemm_impl_def.hpp + ////////////////////////////////////////////////////////////////////////// + template + void KokkosBSPGEMM_numeric(c_row_view_t &rowmapC_, + c_lno_nnz_view_t &entriesC_, + c_scalar_nnz_view_t &valuesC_); + + KokkosBSPGEMM(HandleType *handle_, nnz_lno_t m_, nnz_lno_t n_, nnz_lno_t k_, + nnz_lno_t block_dim_, const_a_lno_row_view_t row_mapA_, + const_a_lno_nnz_view_t entriesA_, bool transposeA_, + const_b_lno_row_view_t row_mapB_, + const_b_lno_nnz_view_t entriesB_, bool transposeB_) + : Base(handle_, m_, n_, k_, row_mapA_, entriesA_, transposeA_, row_mapB_, + entriesB_, transposeB_), + block_dim(block_dim_) {} + + KokkosBSPGEMM(HandleType *handle_, nnz_lno_t m_, nnz_lno_t n_, nnz_lno_t k_, + nnz_lno_t block_dim_, const_a_lno_row_view_t row_mapA_, + 
const_a_lno_nnz_view_t entriesA_, + const_a_scalar_nnz_view_t valsA_, bool transposeA_, + const_b_lno_row_view_t row_mapB_, + const_b_lno_nnz_view_t entriesB_, + const_b_scalar_nnz_view_t valsB_, bool transposeB_) + : Base(handle_, m_, n_, k_, row_mapA_, entriesA_, valsA_, transposeA_, + row_mapB_, entriesB_, valsB_, transposeB_), + block_dim(block_dim_) {} +}; + +} // namespace Impl +} // namespace KokkosSparse +#include "KokkosSparse_bspgemm_impl_kkmem.hpp" +#include "KokkosSparse_bspgemm_impl_speed.hpp" +#include "KokkosSparse_bspgemm_impl_def.hpp" +#endif diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp new file mode 100644 index 0000000000..36729f39ca --- /dev/null +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_def.hpp @@ -0,0 +1,81 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +namespace KokkosSparse { + +namespace Impl { + +template +template +void KokkosBSPGEMM:: + KokkosBSPGEMM_numeric(c_row_view_t &rowmapC_, c_lno_nnz_view_t &entriesC_, + c_scalar_nnz_view_t &valuesC_) { + // Query the execution space type; the algorithm is read from Base::spgemm_algorithm below. 
+ // SPGEMMAlgorithm spgemm_algorithm = + // this->handle->get_spgemm_handle()->get_algorithm_type(); + KokkosKernels::Impl::ExecSpaceType my_exec_space_ = + KokkosKernels::Impl::get_exec_space_type(); + + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "Numeric PHASE" << std::endl; + } + + if (Base::spgemm_algorithm == SPGEMM_KK_SPEED || + Base::spgemm_algorithm == SPGEMM_KK_DENSE) { // SPEED and DENSE both dispatch to the speed kernel + this->KokkosBSPGEMM_numeric_speed(rowmapC_, entriesC_, valuesC_, + my_exec_space_); + } else { // every other algorithm dispatches to the hash-based kernel + this->KokkosBSPGEMM_numeric_hash(rowmapC_, entriesC_, valuesC_, + my_exec_space_); + } +} + +} // namespace Impl +} // namespace KokkosSparse diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp new file mode 100644 index 0000000000..aae9d83b5f --- /dev/null +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_kkmem.hpp @@ -0,0 +1,1658 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#define HASHSCALAR 107 + +#include "KokkosKernels_Utils.hpp" +#include "KokkosKernels_BlockHashmapAccumulator.hpp" + +namespace KokkosSparse { + +namespace Impl { + +template +template +struct KokkosBSPGEMM::PortableNumericCHASH { + using BlockAccumulator = KokkosKernels::Experimental::BlockHashmapAccumulator< + nnz_lno_t, nnz_lno_t, scalar_t, + KokkosKernels::Experimental::HashOpType::bitwiseAnd>; + + static constexpr auto scalarAlignPad = + KokkosBSPGEMM::scalarAlignPad; + nnz_lno_t numrows; + nnz_lno_t block_dim; + const nnz_lno_t block_size; + size_t block_bytes; + + a_row_view_t row_mapA; + a_nnz_view_t entriesA; + a_scalar_view_t valuesA; + + b_row_view_t row_mapB; + b_nnz_view_t entriesB; + b_scalar_view_t valuesB; + + c_row_view_t rowmapC; + c_nnz_view_t entriesC; + c_scalar_view_t valuesC; + + nnz_lno_t *pEntriesC; + scalar_t *pvaluesC; + const size_t shared_memory_size; + const int vector_size; + pool_memory_type memory_space; + + // nnz_lno_t max_nnz; + const nnz_lno_t pow2_hash_size; + const nnz_lno_t max_nnz; + const nnz_lno_t pow2_hash_func; + 
const KokkosKernels::Impl::ExecSpaceType my_exec_space; + + const int unit_memory; // begins, nexts, and keys. No need for vals yet. + int team_size; + int thread_memory; + nnz_lno_t thread_shmem_key_size; + nnz_lno_t thread_shared_memory_hash_func; + nnz_lno_t thread_shmem_hash_size; + + nnz_lno_t team_shmem_key_size; + nnz_lno_t team_shared_memory_hash_func; + nnz_lno_t team_shmem_hash_size; + + nnz_lno_t team_cuckoo_key_size, team_cuckoo_hash_func; + + nnz_lno_t max_first_level_hash_size; + row_lno_persistent_work_view_t flops_per_row; + + PortableNumericCHASH( + nnz_lno_t block_dim_, nnz_lno_t m_, a_row_view_t row_mapA_, + a_nnz_view_t entriesA_, a_scalar_view_t valuesA_, + + b_row_view_t row_mapB_, b_nnz_view_t entriesB_, b_scalar_view_t valuesB_, + + c_row_view_t rowmapC_, c_nnz_view_t entriesC_, c_scalar_view_t valuesC_, + size_t shared_memory_size_, int vector_size_, pool_memory_type mpool_, + nnz_lno_t min_hash_size, nnz_lno_t max_nnz_, int team_size_, + const KokkosKernels::Impl::ExecSpaceType my_exec_space_, + double first_level_cut_off, row_lno_persistent_work_view_t flops_per_row_, + bool KOKKOSKERNELS_VERBOSE_) + : numrows(m_), + block_dim(block_dim_), + block_size(block_dim_ * block_dim_), + block_bytes(sizeof(scalar_t) * block_dim * block_dim), + row_mapA(row_mapA_), + entriesA(entriesA_), + valuesA(valuesA_), + + row_mapB(row_mapB_), + entriesB(entriesB_), + valuesB(valuesB_), + + rowmapC(rowmapC_), + entriesC(entriesC_), + valuesC(valuesC_), + pEntriesC(entriesC_.data()), + pvaluesC(valuesC_.data()), + shared_memory_size(shared_memory_size_ / 8 * 8), + vector_size(vector_size_), + memory_space(mpool_), + // max_nnz(), + pow2_hash_size(min_hash_size), + max_nnz(max_nnz_), + pow2_hash_func(min_hash_size - 1), + my_exec_space(my_exec_space_), + unit_memory(sizeof(nnz_lno_t) * 2 + sizeof(nnz_lno_t) + block_bytes), + team_size(team_size_), + thread_memory((shared_memory_size / 8 / team_size_) * 8), + thread_shmem_key_size(), + 
thread_shared_memory_hash_func(), + thread_shmem_hash_size(1), + team_shmem_key_size(), + team_shared_memory_hash_func(), + team_shmem_hash_size(1), + team_cuckoo_key_size(1), + team_cuckoo_hash_func(1), + max_first_level_hash_size(1), + flops_per_row(flops_per_row_) + + { + nnz_lno_t tmp_team_cuckoo_key_size = + ((shared_memory_size - sizeof(nnz_lno_t) * 2) / + (sizeof(nnz_lno_t) + block_bytes)); + + while (team_cuckoo_key_size * 2 < tmp_team_cuckoo_key_size) + team_cuckoo_key_size = team_cuckoo_key_size * 2; + team_cuckoo_hash_func = team_cuckoo_key_size - 1; + team_shmem_key_size = + ((shared_memory_size - sizeof(nnz_lno_t) * 4 - scalarAlignPad) / + unit_memory); + thread_shmem_key_size = + ((thread_memory - sizeof(nnz_lno_t) * 4 - scalarAlignPad) / + unit_memory); + if (KOKKOSKERNELS_VERBOSE_) { + std::cout << "\t\tPortableNumericCHASH -- sizeof(scalar_t): " + << sizeof(scalar_t) + << " sizeof(nnz_lno_t): " << sizeof(nnz_lno_t) + << " team_size: " << team_size << std::endl; + std::cout << "\t\tPortableNumericCHASH -- thread_memory:" << thread_memory + << " unit_memory:" << unit_memory + << " initial key size:" << thread_shmem_key_size << std::endl; + std::cout << "\t\tPortableNumericCHASH -- team shared_memory:" + << shared_memory_size << " unit_memory:" << unit_memory + << " initial team key size:" << team_shmem_key_size + << std::endl; + } + while (thread_shmem_hash_size * 2 <= thread_shmem_key_size) { + thread_shmem_hash_size = thread_shmem_hash_size * 2; + } + while (team_shmem_hash_size * 2 <= team_shmem_key_size) { + team_shmem_hash_size = team_shmem_hash_size * 2; + } + team_shared_memory_hash_func = team_shmem_hash_size - 1; + thread_shared_memory_hash_func = thread_shmem_hash_size - 1; + team_shmem_key_size = + team_shmem_key_size + + ((team_shmem_key_size - team_shmem_hash_size) * sizeof(nnz_lno_t)) / + (sizeof(nnz_lno_t) * 2 + block_bytes); + team_shmem_key_size = (team_shmem_key_size >> 1) << 1; + + thread_shmem_key_size = + thread_shmem_key_size + 
+ ((thread_shmem_key_size - thread_shmem_hash_size) * sizeof(nnz_lno_t)) / + (sizeof(nnz_lno_t) * 2 + block_bytes); + thread_shmem_key_size = (thread_shmem_key_size >> 1) << 1; + + if (KOKKOSKERNELS_VERBOSE_) { + std::cout << "\t\tPortableNumericCHASH -- thread_memory:" << thread_memory + << " unit_memory:" << unit_memory + << " resized key size:" << thread_shmem_key_size << std::endl; + std::cout << "\t\tPortableNumericCHASH -- team shared_memory:" + << shared_memory_size << " unit_memory:" << unit_memory + << " resized team key size:" << team_shmem_key_size + << std::endl; + } + + max_first_level_hash_size = first_level_cut_off * team_cuckoo_key_size; + if (KOKKOSKERNELS_VERBOSE_) { + std::cout << "\t\tPortableNumericCHASH -- thread_memory:" << thread_memory + << " unit_memory:" << unit_memory + << " initial key size:" << thread_shmem_key_size << std::endl; + std::cout << "\t\tPortableNumericCHASH -- team_memory:" + << shared_memory_size << " unit_memory:" << unit_memory + << " initial team key size:" << team_shmem_key_size + << std::endl; + std::cout << "\t\tPortableNumericCHASH -- adjusted hashsize:" + << thread_shmem_hash_size + << " thread_shmem_key_size:" << thread_shmem_key_size + << std::endl; + std::cout << "\t\tPortableNumericCHASH -- adjusted team hashsize:" + << team_shmem_hash_size + << " team_shmem_key_size:" << team_shmem_key_size << std::endl; + std::cout << "\t\t team_cuckoo_key_size:" << team_cuckoo_key_size + << " team_cuckoo_hash_func:" << team_cuckoo_hash_func + << " max_first_level_hash_size:" << max_first_level_hash_size + << std::endl; + std::cout << "\t\t pow2_hash_size:" << pow2_hash_size + << " pow2_hash_func:" << pow2_hash_func << std::endl; + } + } + + void set_team_size(int team_size_) { + this->team_size = team_size_; + this->thread_memory = (shared_memory_size / 8 / team_size_) * 8; + } + + KOKKOS_INLINE_FUNCTION + size_t get_thread_id(const size_t row_index) const { + switch (my_exec_space) { + default: return row_index; +#if 
defined(KOKKOS_ENABLE_SERIAL) + case KokkosKernels::Impl::Exec_SERIAL: return 0; +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + case KokkosKernels::Impl::Exec_OMP: + return Kokkos::OpenMP::impl_hardware_thread_id(); +#endif +#if defined(KOKKOS_ENABLE_THREADS) + case KokkosKernels::Impl::Exec_THREADS: + return Kokkos::Threads::impl_hardware_thread_id(); +#endif +#if defined(KOKKOS_ENABLE_CUDA) + case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined(KOKKOS_ENABLE_HIP) + case KokkosKernels::Impl::Exec_HIP: return row_index; +#endif + } + } + + // linear probing with tracking. + KOKKOS_INLINE_FUNCTION + void operator()(const MultiCoreTag4 &, + const team_member_t &teamMember) const { + const nnz_lno_t team_row_begin = + teamMember.league_rank() * teamMember.team_size(); + const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN( + team_row_begin + teamMember.team_size(), numrows); + + volatile nnz_lno_t *tmp = NULL; + size_t tid = get_thread_id(team_row_begin + teamMember.team_rank()); + while (tmp == NULL) { + tmp = (volatile nnz_lno_t *)(memory_space.allocate_chunk(tid)); + } + + nnz_lno_t *used_indices = (nnz_lno_t *)(tmp); + tmp += max_nnz; + nnz_lno_t *hash_ids = (nnz_lno_t *)(tmp); + tmp += pow2_hash_size; + + scalar_t *hash_values = + KokkosKernels::Impl::alignPtr(tmp); + + BlockAccumulator hm(block_dim, pow2_hash_size, pow2_hash_func, nullptr, + nullptr, hash_ids, hash_values); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), + [&](const nnz_lno_t &row_index) { + nnz_lno_t used_count = 0; + + const size_type col_begin = row_mapA[row_index]; + const nnz_lno_t left_work = row_mapA[row_index + 1] - col_begin; + for (nnz_lno_t ii = 0; ii < left_work; ++ii) { + size_type a_col = col_begin + ii; + nnz_lno_t rowB = entriesA[a_col]; + const scalar_t *valA = valuesA.data() + a_col * block_size; + + size_type rowBegin = row_mapB(rowB); + nnz_lno_t left_workB = row_mapB(rowB + 1) - rowBegin; + + for 
(nnz_lno_t i = 0; i < left_workB; ++i) { + const size_type adjind = i + rowBegin; + nnz_lno_t b_col_ind = entriesB[adjind]; + const scalar_t *valB = valuesB.data() + adjind * block_size; + + hm.sequential_insert_into_hash_simple(b_col_ind, valA, valB, + used_count, used_indices); + } + } + size_type c_row_begin = rowmapC[row_index]; + hm.sequential_export_values_simple( + used_count, used_indices, pEntriesC + c_row_begin, + pvaluesC + c_row_begin * block_size); + }); + memory_space.release_chunk(used_indices); + } + + // assumes that the vector lane is 1, as in cpus + KOKKOS_INLINE_FUNCTION + void operator()(const MultiCoreTag &, const team_member_t &teamMember) const { + const nnz_lno_t team_row_begin = + teamMember.league_rank() * teamMember.team_size(); + const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN( + team_row_begin + teamMember.team_size(), numrows); + + BlockAccumulator hm2(block_dim, pow2_hash_size, pow2_hash_func, nullptr, + nullptr, nullptr, nullptr); + + volatile nnz_lno_t *tmp = NULL; + size_t tid = get_thread_id(team_row_begin + teamMember.team_rank()); + while (tmp == NULL) { + tmp = (volatile nnz_lno_t *)(memory_space.allocate_chunk(tid)); + } + nnz_lno_t *globally_used_hash_indices = (nnz_lno_t *)tmp; + tmp += pow2_hash_size; + + hm2.hash_begins = (nnz_lno_t *)(tmp); + tmp += pow2_hash_size; + hm2.hash_nexts = (nnz_lno_t *)(tmp); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), + [&](const nnz_lno_t &row_index) { + nnz_lno_t globally_used_hash_count = 0; + nnz_lno_t used_hash_sizes = 0; + + const size_type c_row_begin = rowmapC[row_index]; + + hm2.keys = pEntriesC + c_row_begin; + hm2.values = pvaluesC + c_row_begin * block_size; + + const size_type col_begin = row_mapA[row_index]; + const nnz_lno_t left_work = row_mapA[row_index + 1] - col_begin; + + for (nnz_lno_t ii = 0; ii < left_work; ++ii) { + size_type a_col = col_begin + ii; + nnz_lno_t rowB = entriesA[a_col]; + const scalar_t *a_val = 
valuesA.data() + a_col * block_size; + + size_type rowBegin = row_mapB(rowB); + nnz_lno_t left_workB = row_mapB(rowB + 1) - rowBegin; + + for (nnz_lno_t i = 0; i < left_workB; ++i) { + const size_type adjind = i + rowBegin; + nnz_lno_t b_col_ind = entriesB[adjind]; + const scalar_t *b_val = valuesB.data() + adjind * block_size; + // nnz_lno_t hash = (b_col_ind * 107) & pow2_hash_func; + + // this has to be a success, we do not need to check for the + // success. int insertion = + hm2.sequential_insert_into_hash_mergeAdd_TrackHashes( + b_col_ind, a_val, b_val, &used_hash_sizes, + &globally_used_hash_count, globally_used_hash_indices); + } + } + for (nnz_lno_t i = 0; i < globally_used_hash_count; ++i) { + nnz_lno_t dirty_hash = globally_used_hash_indices[i]; + hm2.hash_begins[dirty_hash] = -1; + } + }); + memory_space.release_chunk(globally_used_hash_indices); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const GPUTag &, const team_member_t &teamMember) const { + nnz_lno_t team_row_begin = + teamMember.league_rank() * teamMember.team_size(); + const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN( + team_row_begin + teamMember.team_size(), numrows); + + // int thread_memory = (shared_memory_size / 8 / teamMember.team_size()) * + // 8; + char *all_shared_memory = + (char *)(teamMember.team_shmem().get_shmem(shared_memory_size)); + + // shift it to the thread private part + all_shared_memory += thread_memory * teamMember.team_rank(); + + // used_hash_sizes hold the size of 1st and 2nd level hashes + volatile nnz_lno_t *used_hash_sizes = + (volatile nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * 2; + + nnz_lno_t *globally_used_hash_count = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * 2; + + // int unit_memory = sizeof(nnz_lno_t) * 2 + sizeof(nnz_lno_t) + sizeof + // (scalar_t) ; //begins, nexts, keys and vals . 
nnz_lno_t shmem_key_size = + // (thread_memory - sizeof(nnz_lno_t) * 4) / unit_memory; if (shmem_key_size + // & 1) shmem_key_size -= 1; + + nnz_lno_t *begins = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * thread_shmem_hash_size; + + // points to the next elements + nnz_lno_t *nexts = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * thread_shmem_key_size; + + // holds the keys + nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * thread_shmem_key_size; + // remainder of shmem allocation for vals + scalar_t *vals = + KokkosKernels::Impl::alignPtr(all_shared_memory); + + BlockAccumulator hm(block_dim, thread_shmem_key_size, + thread_shared_memory_hash_func, begins, nexts, keys, + vals); + + BlockAccumulator hm2(block_dim, pow2_hash_size, pow2_hash_func, nullptr, + nullptr, nullptr, nullptr); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), + [&](const nnz_lno_t &row_index) { + const size_type c_row_begin = rowmapC[row_index]; + const size_type c_row_end = rowmapC[row_index + 1]; + const nnz_lno_t global_memory_hash_size = + nnz_lno_t(c_row_end - c_row_begin); + + bool is_global_alloced = false; + nnz_lno_t *globally_used_hash_indices = NULL; + + if (global_memory_hash_size > thread_shmem_key_size) { + volatile nnz_lno_t *tmp = NULL; + // size_t tid = get_thread_id(row_index); + // the code gets internal compiler error on gcc 4.7.2 + // assuming that this part only runs on GPUs for now, below fix + // has the exact same behaviour and runs okay. 
+ size_t tid = row_index; + + while (tmp == NULL) { + Kokkos::single( + Kokkos::PerThread(teamMember), + [&](volatile nnz_lno_t *&memptr) { + memptr = (volatile nnz_lno_t *)(memory_space.allocate_chunk( + tid)); + }, + tmp); + } + + is_global_alloced = true; + globally_used_hash_indices = (nnz_lno_t *)tmp; + tmp += pow2_hash_size; + hm2.hash_begins = (nnz_lno_t *)(tmp); + tmp += pow2_hash_size; + hm2.hash_nexts = (nnz_lno_t *)(tmp); + } + hm2.keys = pEntriesC + c_row_begin; + hm2.values = pvaluesC + c_row_begin * block_size; + + // initialize begins. + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, thread_shmem_hash_size), + [&](nnz_lno_t i) { begins[i] = -1; }); + + // initialize hash usage sizes + Kokkos::single(Kokkos::PerThread(teamMember), [&]() { + used_hash_sizes[0] = 0; + used_hash_sizes[1] = 0; + globally_used_hash_count[0] = 0; + }); + + const size_type col_begin = row_mapA[row_index]; + const nnz_lno_t left_work = row_mapA[row_index + 1] - col_begin; + nnz_lno_t ii = left_work; + // for ( nnz_lno_t ii = 0; ii < left_work; ++ii){ + while (ii-- > 0) { + size_type a_col = col_begin + ii; + nnz_lno_t rowB = entriesA[a_col]; + const scalar_t *valA = valuesA.data() + a_col * block_size; + + size_type rowBegin = row_mapB(rowB); + nnz_lno_t left_work_ = row_mapB(rowB + 1) - rowBegin; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, left_work_), + [&](nnz_lno_t i) { + const size_type adjind = i + rowBegin; + nnz_lno_t b_col_ind = entriesB[adjind]; + const scalar_t *valB = valuesB.data() + adjind * block_size; + volatile int num_unsuccess = + hm.vector_atomic_insert_into_hash_mergeAdd( + b_col_ind, valA, valB, used_hash_sizes); + if (num_unsuccess) { + hm2.vector_atomic_insert_into_hash_mergeAdd_TrackHashes( + b_col_ind, valA, valB, used_hash_sizes + 1, + globally_used_hash_count, globally_used_hash_indices); + } + }); + } + + if (is_global_alloced) { + nnz_lno_t dirty_hashes = globally_used_hash_count[0]; + Kokkos::parallel_for( + 
Kokkos::ThreadVectorRange(teamMember, dirty_hashes), + [&](nnz_lno_t i) { + nnz_lno_t dirty_hash = globally_used_hash_indices[i]; + hm2.hash_begins[dirty_hash] = -1; + }); + + Kokkos::single(Kokkos::PerThread(teamMember), [&]() { + memory_space.release_chunk(globally_used_hash_indices); + }); + } + + Kokkos::single(Kokkos::PerThread(teamMember), [&]() { + if (used_hash_sizes[0] > thread_shmem_key_size) + used_hash_sizes[0] = thread_shmem_key_size; + }); + + nnz_lno_t num_elements = used_hash_sizes[0]; + + nnz_lno_t written_index = used_hash_sizes[1]; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, num_elements), + [&](nnz_lno_t i) { + const auto idx = c_row_begin + written_index + i; + pEntriesC[idx] = keys[i]; + kk_block_set(block_dim, pvaluesC + idx * block_size, + vals + i * block_size); + }); + }); + } + + // one row does not fit into shmem, with thread-flat-parallel + KOKKOS_INLINE_FUNCTION + void operator()(const GPUTag6 &, const team_member_t &teamMember) const { + nnz_lno_t team_row_begin = + teamMember.league_rank() * teamMember.team_size(); + const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN( + team_row_begin + teamMember.team_size(), numrows); + char *all_shared_memory = + (char *)(teamMember.team_shmem().get_shmem(shared_memory_size)); + + // shmem == sizeof(nnz_lno_t)*2 + sizeof(nnz_lno_t)*team_cuckoo_key_size + + // sizeof(scalar_t)*nvals + const nnz_lno_t init_value = -1; + volatile nnz_lno_t *used_hash_sizes = + (volatile nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * 2; + // holds the keys + nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * team_cuckoo_key_size; + scalar_t *vals = + KokkosKernels::Impl::alignPtr(all_shared_memory); + + int thread_rank = teamMember.team_rank(); + + int vector_rank = 0; + typedef typename std::remove_reference::type + atomic_incr_type; + Kokkos::parallel_scan( + Kokkos::ThreadVectorRange(teamMember, vector_size), + 
[&](const int /* threadid */, int &update, const bool final) { + if (final) { + vector_rank = update; + } + update += 1; + }); + int bs = vector_size * team_size; + int vector_shift = thread_rank * vector_size + vector_rank; + + for (nnz_lno_t row_index = team_row_begin; row_index < team_row_end; + ++row_index) { + if (row_mapA[row_index] == row_mapA[row_index + 1]) // skip empty A rows + continue; +#if 1 + teamMember.team_barrier(); +#endif + const size_type c_row_begin = rowmapC[row_index]; + const size_type c_row_end = rowmapC[row_index + 1]; + const nnz_lno_t c_row_size = c_row_end - c_row_begin; + nnz_lno_t *c_row = entriesC.data() + c_row_begin; + scalar_t *c_row_vals = valuesC.data() + c_row_begin * block_size; + nnz_lno_t *global_acc_row_keys = c_row; + scalar_t *global_acc_row_vals = c_row_vals; + volatile nnz_lno_t *tmp = NULL; + + if (c_row_size > max_first_level_hash_size) { + { + while (tmp == NULL) { + Kokkos::single( + Kokkos::PerTeam(teamMember), + [&](volatile nnz_lno_t *&memptr) { + memptr = (volatile nnz_lno_t *)(memory_space.allocate_chunk( + row_index)); + }, + tmp); + } + global_acc_row_keys = (nnz_lno_t *)(tmp); + global_acc_row_vals = + KokkosKernels::Impl::alignPtr( + tmp + pow2_hash_size); + } + // initialize begins. + { + nnz_lno_t num_threads = pow2_hash_size / vector_size; + // not needed as team_cuckoo_key_size is always pow2. + + // (team_cuckoo_key_size & (vector_size - 1)) * 1; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, num_threads), + [&](nnz_lno_t teamind) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, vector_size), + [&](nnz_lno_t i) { + const auto idx = teamind * vector_size + i; + kk_block_init(block_dim, + global_acc_row_vals + idx * block_size); + }); + }); + } + } + + // initialize begins. + { + nnz_lno_t num_threads = team_cuckoo_key_size / vector_size; + // not needed as team_cuckoo_key_size is always pow2. 
+ + // (team_cuckoo_key_size & (vector_size - 1)) * 1; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, num_threads), + [&](nnz_lno_t teamind) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, vector_size), + [&](nnz_lno_t i) { + const auto idx = teamind * vector_size + i; + keys[idx] = init_value; + kk_block_init(block_dim, vals + idx * block_size); + }); + }); + } + + // initialize hash usage sizes + Kokkos::single(Kokkos::PerTeam(teamMember), [&]() { + used_hash_sizes[0] = 0; + used_hash_sizes[1] = 0; + }); + + bool insert_is_on = true; + const size_type a_col_begin_offset = row_mapA[row_index]; + + nnz_lno_t a_col_ind = entriesA[a_col_begin_offset]; + const scalar_t *a_val = valuesA.data() + a_col_begin_offset * block_size; + + nnz_lno_t current_a_column_offset_inrow = 0; + nnz_lno_t flops_on_the_left_of_offsett = 0; + size_type current_b_read_offsett = row_mapB[a_col_ind]; + nnz_lno_t current_a_column_flops = + row_mapB[a_col_ind + 1] - current_b_read_offsett; + + nnz_lno_t row_flops = flops_per_row(row_index); + +#if 1 + teamMember.team_barrier(); +#endif + for (nnz_lno_t vector_read_shift = vector_shift; + vector_read_shift < row_flops; vector_read_shift += bs) { + { + nnz_lno_t my_b_col_shift = + vector_read_shift - flops_on_the_left_of_offsett; + nnz_lno_t my_b_col = init_value; + nnz_lno_t hash = init_value; + int fail = 0; + + if (my_b_col_shift >= current_a_column_flops) { + do { + ++current_a_column_offset_inrow; + my_b_col_shift -= current_a_column_flops; + flops_on_the_left_of_offsett += current_a_column_flops; + a_col_ind = + entriesA[a_col_begin_offset + current_a_column_offset_inrow]; + + current_b_read_offsett = row_mapB[a_col_ind]; + current_a_column_flops = + row_mapB[a_col_ind + 1] - current_b_read_offsett; + } while (my_b_col_shift >= current_a_column_flops); + const auto idx = a_col_begin_offset + current_a_column_offset_inrow; + a_val = valuesA.data() + idx * block_size; + } + + const auto idx = my_b_col_shift + 
current_b_read_offsett; + my_b_col = entriesB[idx]; + const scalar_t *b_val = valuesB.data() + idx * block_size; + // now insert it to first level hashmap accumulator. + hash = (my_b_col * HASHSCALAR) & team_cuckoo_hash_func; + fail = 1; + bool try_to_insert = true; + + // nnz_lno_t max_tries = team_cuckoo_key_size; + nnz_lno_t search_end = + team_cuckoo_key_size; // KOKKOSKERNELS_MACRO_MIN(team_cuckoo_key_size, + // hash + max_tries); + for (nnz_lno_t trial = hash; trial < search_end;) { + if (keys[trial] == my_b_col) { + kk_vector_block_add_mul(block_dim, vals + trial * block_size, + a_val, b_val); + fail = 0; + break; + } else if (keys[trial] == init_value) { + if (!insert_is_on) { + try_to_insert = false; + break; + } else if (Kokkos::atomic_compare_exchange_strong( + keys + trial, init_value, my_b_col)) { + kk_vector_block_add_mul(block_dim, vals + trial * block_size, + a_val, b_val); + Kokkos::atomic_increment(used_hash_sizes); + if (used_hash_sizes[0] > max_first_level_hash_size) + insert_is_on = false; + fail = 0; + break; + } + } else { + ++trial; + } + } + if (fail) { + search_end = hash; // max_tries - (team_cuckoo_key_size - hash); + + for (nnz_lno_t trial = 0; try_to_insert && trial < search_end;) { + if (keys[trial] == my_b_col) { + kk_vector_block_add_mul(block_dim, vals + trial * block_size, + a_val, b_val); + fail = 0; + break; + } else if (keys[trial] == init_value) { + if (!insert_is_on) { + break; + } else if (Kokkos::atomic_compare_exchange_strong( + keys + trial, init_value, my_b_col)) { + kk_vector_block_add_mul(block_dim, vals + trial * block_size, + a_val, b_val); + Kokkos::atomic_increment(used_hash_sizes); + if (used_hash_sizes[0] > max_first_level_hash_size) + insert_is_on = false; + fail = 0; + break; + } + } else { + ++trial; + } + } + + if (fail) { + nnz_lno_t new_hash = (my_b_col * HASHSCALAR) & pow2_hash_func; + + for (nnz_lno_t trial = new_hash; trial < pow2_hash_size;) { + if (global_acc_row_keys[trial] == my_b_col) { + 
kk_vector_block_add_mul( + block_dim, global_acc_row_vals + trial * block_size, + a_val, b_val); + // c_row_vals[trial] += my_b_val; + fail = 0; + break; + } else if (global_acc_row_keys[trial] == init_value) { + if (Kokkos::atomic_compare_exchange_strong( + global_acc_row_keys + trial, init_value, my_b_col)) { + kk_vector_block_add_mul( + block_dim, global_acc_row_vals + trial * block_size, + a_val, b_val); + // Kokkos::atomic_increment(used_hash_sizes + 1); + // c_row_vals[trial] = my_b_val; + fail = 0; + break; + } + } else { + ++trial; + } + } + if (fail) { + for (nnz_lno_t trial = 0; trial < new_hash;) { + if (global_acc_row_keys[trial] == my_b_col) { + // c_row_vals[trial] += my_b_val; + kk_vector_block_add_mul( + block_dim, global_acc_row_vals + trial * block_size, + a_val, b_val); + break; + } else if (global_acc_row_keys[trial] == init_value) { + if (Kokkos::atomic_compare_exchange_strong( + global_acc_row_keys + trial, init_value, + my_b_col)) { + // Kokkos::atomic_increment(used_hash_sizes + 1); + kk_vector_block_add_mul( + block_dim, global_acc_row_vals + trial * block_size, + a_val, b_val); + // c_row_vals[trial] = my_b_val; + break; + } + } else { + ++trial; + } + } + } + } + } + } + } + + teamMember.team_barrier(); + + if (tmp != NULL) { + for (nnz_lno_t my_index = vector_shift; my_index < pow2_hash_size; + my_index += bs) { + nnz_lno_t my_b_col = global_acc_row_keys[my_index]; + if (my_b_col != init_value) { + const scalar_t *b_val = global_acc_row_vals + my_index * block_size; + int fail = 1; + { + nnz_lno_t trial = (my_b_col * HASHSCALAR) & team_cuckoo_hash_func; + for (nnz_lno_t max_tries = team_cuckoo_key_size; max_tries-- > 0; + trial = (trial + 1) & team_cuckoo_hash_func) { + if (keys[trial] == my_b_col) { + kk_block_add(block_dim, vals + trial * block_size, b_val); + fail = 0; + break; + } else if (keys[trial] == init_value) { + break; + } + } + } + if (fail) { + nnz_lno_t write_index = 0; + write_index = 
Kokkos::atomic_fetch_add(used_hash_sizes + 1, + atomic_incr_type(1)); + c_row[write_index] = my_b_col; + kk_block_set(block_dim, c_row_vals + write_index * block_size, + b_val); + } + global_acc_row_keys[my_index] = init_value; + } + } + + teamMember.team_barrier(); + Kokkos::single(Kokkos::PerTeam(teamMember), [&]() { + memory_space.release_chunk(global_acc_row_keys); + }); + } + + for (nnz_lno_t my_index = vector_shift; my_index < team_cuckoo_key_size; + my_index += bs) { + nnz_lno_t my_key = keys[my_index]; + if (my_key != init_value) { + const scalar_t *my_val = vals + my_index * block_size; + nnz_lno_t write_index = 0; + write_index = Kokkos::atomic_fetch_add(used_hash_sizes + 1, + atomic_incr_type(1)); + c_row[write_index] = my_key; + kk_block_set(block_dim, c_row_vals + write_index * block_size, + my_val); + } + } + } + } + + // In this one row fits into shmem with team-flat-parallel + KOKKOS_INLINE_FUNCTION + void operator()(const GPUTag4 &, const team_member_t &teamMember) const { + const nnz_lno_t init_value = -1; + nnz_lno_t team_row_begin = + teamMember.league_rank() * teamMember.team_size(); + const nnz_lno_t team_row_end = KOKKOSKERNELS_MACRO_MIN( + team_row_begin + teamMember.team_size(), numrows); + + // shmem == sizeof(nnz_lno_t)*2 + sizeof(nnz_lno_t)*team_cuckoo_key_size + + // sizeof(scalar_t)*nvals + char *all_shared_memory = + (char *)(teamMember.team_shmem().get_shmem(shared_memory_size)); + + volatile nnz_lno_t *used_hash_sizes = + (volatile nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * 2; + + // holds the keys + nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * team_cuckoo_key_size; + scalar_t *vals = + KokkosKernels::Impl::alignPtr(all_shared_memory); + + int thread_rank = teamMember.team_rank(); + + int vector_rank = 0; + typedef typename std::remove_reference::type + atomic_incr_type; + Kokkos::parallel_scan( + Kokkos::ThreadVectorRange(teamMember, vector_size), + 
[&](const int /* threadid */, int &update, const bool final) { + if (final) { + vector_rank = update; + } + update += 1; + }); + int bs = vector_size * team_size; + int vector_shift = thread_rank * vector_size + vector_rank; + for (nnz_lno_t row_index = team_row_begin; row_index < team_row_end; + ++row_index) { + if (row_mapA[row_index] == row_mapA[row_index + 1]) // skip empty A rows + continue; +#if 1 + teamMember.team_barrier(); +#endif + const size_type c_row_begin = rowmapC[row_index]; + // const size_type c_row_end = rowmapC[row_index + 1]; + // const nnz_lno_t c_row_size = c_row_end - c_row_begin; + nnz_lno_t *c_row = entriesC.data() + c_row_begin; + scalar_t *c_row_vals = valuesC.data() + c_row_begin * block_size; + + // initialize begins. + { + nnz_lno_t num_threads = + team_cuckoo_key_size / + vector_size; // not needed as team_cuckoo_key_size is always pow2. + // + (team_cuckoo_key_size & (vector_size - 1)) * 1; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, num_threads), + [&](nnz_lno_t teamind) { + // nnz_lno_t team_shift = teamind * vector_size; + // nnz_lno_t work_to_handle = KOKKOSKERNELS_MACRO_MIN(vector_size, + // team_shmem_hash_size - team_shift); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, vector_size), + [&](nnz_lno_t i) { + const auto idx = teamind * vector_size + i; + keys[idx] = init_value; + kk_block_init(block_dim, vals + idx * block_size); + }); + }); + } + +#if 0 + teamMember.team_barrier(); + + Kokkos::single(Kokkos::PerTeam(teamMember),[&] () { + + for (int i = 0; i < team_shmem_hash_size; ++i){ + if (begins[i] != init_value){ + std::cout << "row_index:" << row_index << " i:" << i << " team_shmem_hash_size:" << team_shmem_hash_size << " is not init_value begins[i]:" << begins[i] << std::endl; + } + } + }); + + teamMember.team_barrier(); +#endif + // initialize hash usage sizes + Kokkos::single(Kokkos::PerTeam(teamMember), [&]() { + used_hash_sizes[0] = 0; + used_hash_sizes[1] = 0; +#if 0 + 
globally_used_hash_count[0] = 0; +#endif + }); +#if 0 + + teamMember.team_barrier(); +#endif +#if 0 + bool is_global_alloced = false; + nnz_lno_t *globally_used_hash_indices = NULL; +#endif + const size_type a_col_begin_offset = row_mapA[row_index]; + + nnz_lno_t a_col_ind = entriesA[a_col_begin_offset]; + const scalar_t *a_val = valuesA.data() + a_col_begin_offset * block_size; + + nnz_lno_t current_a_column_offset_inrow = 0; + nnz_lno_t flops_on_the_left_of_offsett = 0; + size_type current_b_read_offsett = row_mapB[a_col_ind]; + nnz_lno_t current_a_column_flops = + row_mapB[a_col_ind + 1] - current_b_read_offsett; + + // nnz_lno_t ii = left_work; + nnz_lno_t row_flops = flops_per_row(row_index); + +#if 1 + teamMember.team_barrier(); +#endif + + for (nnz_lno_t vector_read_shift = vector_shift; + vector_read_shift < row_flops; vector_read_shift += bs) { + { + nnz_lno_t my_b_col_shift = + vector_read_shift - flops_on_the_left_of_offsett; + nnz_lno_t my_b_col = init_value; + nnz_lno_t hash = init_value; + int fail = 0; + + if (my_b_col_shift >= current_a_column_flops) { + do { + ++current_a_column_offset_inrow; + my_b_col_shift -= current_a_column_flops; + flops_on_the_left_of_offsett += current_a_column_flops; + a_col_ind = + entriesA[a_col_begin_offset + current_a_column_offset_inrow]; + + current_b_read_offsett = row_mapB[a_col_ind]; + current_a_column_flops = + row_mapB[a_col_ind + 1] - current_b_read_offsett; + } while (my_b_col_shift >= current_a_column_flops); + const auto idx = a_col_begin_offset + current_a_column_offset_inrow; + a_val = valuesA.data() + idx * block_size; + } + + my_b_col = entriesB[my_b_col_shift + current_b_read_offsett]; + + const auto idx = my_b_col_shift + current_b_read_offsett; + const scalar_t *b_val = valuesB.data() + idx * block_size; + + // now insert it to first level hashmap accumulator. 
+ hash = (my_b_col * HASHSCALAR) & team_cuckoo_hash_func; + fail = 1; + + for (nnz_lno_t trial = hash; trial < team_cuckoo_key_size;) { + if (keys[trial] == my_b_col) { + kk_vector_block_add_mul(block_dim, vals + trial * block_size, + a_val, b_val); + fail = 0; + break; + } else if (keys[trial] == init_value) { + if (Kokkos::atomic_compare_exchange_strong( + keys + trial, init_value, my_b_col)) { + kk_vector_block_add_mul(block_dim, vals + trial * block_size, + a_val, b_val); + fail = 0; + break; + } + } else { + ++trial; + } + } + if (fail) { + for (nnz_lno_t trial = 0; trial < hash;) { + if (keys[trial] == my_b_col) { + kk_vector_block_add_mul(block_dim, vals + trial * block_size, + a_val, b_val); + fail = 0; + break; + } else if (keys[trial] == init_value) { + if (Kokkos::atomic_compare_exchange_strong( + keys + trial, init_value, my_b_col)) { + kk_vector_block_add_mul(block_dim, vals + trial * block_size, + a_val, b_val); + fail = 0; + break; + } + } else { + ++trial; + } + } + } + } + } + + teamMember.team_barrier(); + for (nnz_lno_t my_index = vector_shift; my_index < team_cuckoo_key_size; + my_index += bs) { + nnz_lno_t my_key = keys[my_index]; + if (my_key != init_value) { + const scalar_t *my_val = vals + my_index * block_size; + nnz_lno_t write_index = + Kokkos::atomic_fetch_add(used_hash_sizes, atomic_incr_type(1)); + c_row[write_index] = my_key; + kk_block_set(block_dim, c_row_vals + write_index * block_size, + my_val); + } + } + } + } + + size_t team_shmem_size(int /* team_size */) const { + return shared_memory_size; + } +}; + +// +// * Notes on KokkosBSPGEMM_numeric_hash * +// +// Prior to this routine, KokkosBSPGEMM_numeric(...) was called +// +// KokkosBSPGEMM_numeric(...) : +// if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == +// this->spgemm_algorithm) : +// call KokkosBSPGEMM_numeric_speed(...) +// else: +// call KokkosBSPGEMM_numeric_hash(...) (this code!) +// +// * NOTE: KokkosBSPGEMM_numeric_hash2(...) 
is not called +// +// +// KokkosBSPGEMM_numeric_hash: +// +// Algorithm selection may be modified as follows +// +// algorithm_to_run: initialized to spgemm_algorithm input to +// KokkosBSPGEMM_numeric_hash +// * spgemm_algorithm CANNOT be SPGEMM_KK_SPEED or SPGEMM_KK_DENSE +// +// if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == +// this->spgemm_algorithm) : +// if Cuda enabled : +// 1. perform shmem-size + partition computations (used by +// HashMapAccumulator) and flop estimate +// 2. from results of 1. select from SPGEMM_KK_MEMORY_SPREADTEAM, +// SPGEMM_KK_MEMORY_BIGSPREADTEAM, SPGEMM_KK_MEMORY +// * Note: These shmem calculations are not passed along to the +// PortableNumericCHASH functor used by kernels +// TODO check that the pre-shmem calculations and functor shmem +// calculations are consistent - pass shmem values to functor +// else : +// 1. determine if problem is "dense" +// 2. if dense: call "this->KokkosBSPGEMM_numeric_speed" +// else : no change from algorithm_to_run; that is algorithm_to_run == +// SPGEMM_KK || SPGEMM_KK_LP +// +// else : +// skip modification of input algorithm +// +// +// +// Algorithm type matching to kernel Tag: +// +// Policy typedefs with tags found in: KokkosSparse_spgemm_impl.hpp +// +// Cuda algorithm options: +// (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) : +// gpu_team_policy4_t, i.e. GPUTag4 +// (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) : +// gpu_team_policy6_t, i.e. GPUTag6 +// (default == SPGEMM_KK_MEMORY) : gpu_team_policy_t, i.e. GPUTag +// +// Non-Cuda host algorithm options: +// SPGEMM_KK_LP: +// (algorithm_to_run == SPGEMM_KK_LP + Dynamic) : +// dynamic_multicore_team_policy4_t, i.e. MultiCoreTag4 +// (algorithm_to_run == SPGEMM_KK_LP + Static) : multicore_team_policy4_t +// else SPGEMM::KKMEM +// kernel label: "KOKKOSPARSE::SPGEMM::KKMEM::DYNAMIC" : +// dynamic_multicore_team_policy_t, i.e.
MultiCoreTag kernel label: +// "KOKKOSPARSE::SPGEMM::KKMEM::STATIC" : multicore_team_policy_t, i.e. +// MultiCoreTag + +template +template +void KokkosBSPGEMM:: + KokkosBSPGEMM_numeric_hash( + c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_, + KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space) { + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\tHASH MODE" << std::endl; + } + KokkosSparse::SPGEMMAlgorithm algorithm_to_run = this->spgemm_algorithm; + nnz_lno_t brows = Base::row_mapB.extent(0) - 1; + size_type bnnz = Base::valsB.extent(0); + + int suggested_vector_size = + this->handle->get_suggested_vector_size(brows, bnnz); + int suggested_team_size = + this->handle->get_suggested_team_size(suggested_vector_size); + size_t shmem_size_to_use = Base::shmem_size; + + row_lno_persistent_work_view_t flops_per_row = + this->handle->get_spgemm_handle()->row_flops; + size_t original_overall_flops = + this->handle->get_spgemm_handle()->original_overall_flops; + nnz_lno_t max_nnz = this->handle->get_spgemm_handle()->get_max_result_nnz(); + size_type overall_nnz = this->handle->get_spgemm_handle()->get_c_nnz(); + + typedef KokkosKernels::Impl::UniformMemoryPool + pool_memory_space; + nnz_lno_t min_hash_size = 1; + size_t chunksize = 1; + double first_level_cut_off = + this->handle->get_spgemm_handle()->get_first_level_hash_cut_off(); + int hash_scaler = + this->handle->get_spgemm_handle()->get_min_hash_size_scale(); + nnz_lno_t tmp_max_nnz = max_nnz; + + if (hash_scaler == 0) { + tmp_max_nnz = KOKKOSKERNELS_MACRO_MAX( + max_nnz, nnz_lno_t(this->b_col_cnt / this->concurrency + 1)); + } else { + tmp_max_nnz *= hash_scaler; + } + + // START OF SHARED MEMORY SIZE CALCULATIONS + // NOTE: the values computed here are not actually passed to functors + // requiring shmem, the calculations here are used for algorithm selection + const size_t block_bytes = sizeof(scalar_t) * block_dim * block_dim; + nnz_lno_t unit_memory = + sizeof(nnz_lno_t) * 
2 + sizeof(nnz_lno_t) + block_bytes; + nnz_lno_t team_shmem_key_size = + ((shmem_size_to_use - sizeof(nnz_lno_t) * 4 - scalarAlignPad) / + unit_memory); + // alignment padding is per-thread for algorithms with per-thread hashmap + nnz_lno_t thread_memory = + ((shmem_size_to_use / suggested_team_size - scalarAlignPad) / 8) * 8; + + nnz_lno_t thread_shmem_key_size = + ((thread_memory - sizeof(nnz_lno_t) * 4) / unit_memory); + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tinitial PortableNumericCHASH -- thread_memory:" + << thread_memory << " unit_memory:" << unit_memory + << " initial key size:" << thread_shmem_key_size << std::endl; + std::cout << "\t\tinitial PortableNumericCHASH -- team_memory:" + << shmem_size_to_use << " unit_memory:" << unit_memory + << " initial team key size:" << team_shmem_key_size << std::endl; + } + nnz_lno_t thread_shmem_hash_size = 1; + while (thread_shmem_hash_size * 2 <= thread_shmem_key_size) { + thread_shmem_hash_size = thread_shmem_hash_size * 2; + } + nnz_lno_t team_shmem_hash_size = 1; + while (team_shmem_hash_size * 2 <= team_shmem_key_size) { + team_shmem_hash_size = team_shmem_hash_size * 2; + } + // nnz_lno_t team_shared_memory_hash_func = team_shmem_hash_size - 1; + + team_shmem_key_size = + team_shmem_key_size + + ((team_shmem_key_size - team_shmem_hash_size) * sizeof(nnz_lno_t)) / + (sizeof(nnz_lno_t) * 2 + block_bytes); + team_shmem_key_size = (team_shmem_key_size >> 1) << 1; + + thread_shmem_key_size = + thread_shmem_key_size + + ((thread_shmem_key_size - thread_shmem_hash_size) * sizeof(nnz_lno_t)) / + (sizeof(nnz_lno_t) * 2 + block_bytes); + thread_shmem_key_size = (thread_shmem_key_size >> 1) << 1; + + // choose parameters + if (this->spgemm_algorithm == SPGEMM_KK || + SPGEMM_KK_LP == this->spgemm_algorithm) { + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + // then chose the best method and parameters. 
+ size_type average_row_nnz = 0; + size_t average_row_flops = 0; + if (this->a_row_cnt > 0) { + average_row_nnz = overall_nnz / this->a_row_cnt; + average_row_flops = original_overall_flops / this->a_row_cnt; + } + int vector_length_max = + KokkosKernels::Impl::kk_get_max_vector_size(); + // if we have very low flops per row, or our maximum number of nnz is + // prett small, then we do row-base algorithm. + if (SPGEMM_KK_LP != this->spgemm_algorithm && + (average_row_nnz < (size_type)vector_length_max || + average_row_flops < 256)) { + algorithm_to_run = SPGEMM_KK_MEMORY; + // if (average_row_nnz / double (thread_shmem_key_size) > 1.5) + while (average_row_nnz > size_type(thread_shmem_key_size) && + suggested_vector_size < vector_length_max) { + suggested_vector_size = suggested_vector_size * 2; + suggested_vector_size = + KOKKOSKERNELS_MACRO_MIN(vector_length_max, suggested_vector_size); + suggested_team_size = + this->handle->get_suggested_team_size(suggested_vector_size); + thread_memory = (shmem_size_to_use / 8 / suggested_team_size) * 8; + thread_shmem_key_size = + ((thread_memory - sizeof(nnz_lno_t) * 4) / unit_memory); + thread_shmem_hash_size = 1; + while (thread_shmem_hash_size * 2 <= thread_shmem_key_size) { + thread_shmem_hash_size = thread_shmem_hash_size * 2; + } + thread_shmem_key_size = + thread_shmem_key_size + + ((thread_shmem_key_size - thread_shmem_hash_size) * + sizeof(nnz_lno_t) - + scalarAlignPad) / + (sizeof(nnz_lno_t) * 2 + block_bytes); + thread_shmem_key_size = (thread_shmem_key_size >> 1) << 1; + } + + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t\tRunning KKMEM with suggested_vector_size:" + << suggested_vector_size + << " suggested_team_size:" << suggested_team_size + << std::endl; + } + } else { + nnz_lno_t tmp_team_cuckoo_key_size = + ((shmem_size_to_use - sizeof(nnz_lno_t) * 2 - scalarAlignPad) / + (sizeof(nnz_lno_t) + block_bytes)); + int team_cuckoo_key_size = 1; + while (team_cuckoo_key_size * 2 < 
tmp_team_cuckoo_key_size) + team_cuckoo_key_size = team_cuckoo_key_size * 2; + suggested_vector_size = vector_length_max; + suggested_team_size = + this->handle->get_suggested_team_size(suggested_vector_size); + algorithm_to_run = SPGEMM_KK_MEMORY_BIGSPREADTEAM; + while (average_row_nnz < + team_cuckoo_key_size / 2 * + (KOKKOSKERNELS_MACRO_MIN(first_level_cut_off + 0.05, 1))) { + shmem_size_to_use = shmem_size_to_use / 2; + tmp_team_cuckoo_key_size = + ((shmem_size_to_use - sizeof(nnz_lno_t) * 2 - scalarAlignPad) / + (sizeof(nnz_lno_t) + block_bytes)); + team_cuckoo_key_size = 1; + while (team_cuckoo_key_size * 2 < tmp_team_cuckoo_key_size) + team_cuckoo_key_size = team_cuckoo_key_size * 2; + + suggested_team_size = suggested_team_size / 2; + } + if (average_row_flops > + size_t(2) * suggested_team_size * suggested_vector_size && + average_row_nnz > + size_type(team_cuckoo_key_size) * + (KOKKOSKERNELS_MACRO_MIN(first_level_cut_off + 0.05, 1))) { + shmem_size_to_use = shmem_size_to_use * 2; + tmp_team_cuckoo_key_size = + ((shmem_size_to_use - sizeof(nnz_lno_t) * 2 - scalarAlignPad) / + (sizeof(nnz_lno_t) + block_bytes)); + team_cuckoo_key_size = 1; + while (team_cuckoo_key_size * 2 < tmp_team_cuckoo_key_size) + team_cuckoo_key_size = team_cuckoo_key_size * 2; + suggested_team_size = suggested_team_size * 2; + } +#ifdef FIRSTPARAMS + suggested_team_size = KOKKOSKERNELS_MACRO_MAX(4, suggested_team_size); +#else + suggested_team_size = KOKKOSKERNELS_MACRO_MAX(2, suggested_team_size); +#endif + if (max_nnz < + team_cuckoo_key_size * + KOKKOSKERNELS_MACRO_MIN(first_level_cut_off + 0.20, 1)) { + algorithm_to_run = SPGEMM_KK_MEMORY_SPREADTEAM; + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t\tRunning SPGEMM_KK_MEMORY_SPREADTEAM with " + "suggested_vector_size:" + << suggested_vector_size + << " suggested_team_size:" << suggested_team_size + << " shmem_size_to_use:" << shmem_size_to_use + << std::endl; + } + } else { + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout 
<< "\t\t\tRunning SPGEMM_KK_MEMORY_BIGSPREADTEAM with " + "suggested_vector_size:" + << suggested_vector_size + << " suggested_team_size:" << suggested_team_size + << " shmem_size_to_use:" << shmem_size_to_use + << std::endl; + } + } + } + } else { + bool run_dense = false; + nnz_lno_t max_column_cut_off = + this->handle->get_spgemm_handle()->MaxColDenseAcc; + nnz_lno_t col_size = this->b_col_cnt; + if (col_size < max_column_cut_off) { + run_dense = true; + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t\tRunning SPGEMM_KK_DENSE col_size:" << col_size + << " max_column_cut_off:" << max_column_cut_off + << std::endl; + } + } else { + // round up maxNumRoughNonzeros to closest power of 2. + nnz_lno_t tmp_min_hash_size = 1; + while (tmp_max_nnz > tmp_min_hash_size) { + tmp_min_hash_size *= 4; + } + + size_t kkmem_chunksize = + tmp_min_hash_size; // this is for used hash indices + kkmem_chunksize += tmp_min_hash_size; // this is for the hash begins + kkmem_chunksize += max_nnz; // this is for hash nexts + kkmem_chunksize = kkmem_chunksize * sizeof(nnz_lno_t) + scalarAlignPad; + size_t dense_chunksize = + (col_size + col_size / block_bytes + 1) * block_bytes; + + if (kkmem_chunksize >= dense_chunksize * 0.5) { + run_dense = true; + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t\tRunning SPGEMM_KK_SPEED kkmem_chunksize:" + << kkmem_chunksize + << " dense_chunksize:" << dense_chunksize << std::endl; + } + } else { + run_dense = false; + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t\tRunning SPGEMM_KK_MEMORY col_size:" << col_size + << " max_column_cut_off:" << max_column_cut_off + << std::endl; + } + } + } + + if (run_dense) { + this->KokkosBSPGEMM_numeric_speed(rowmapC_, entriesC_, valuesC_, + lcl_my_exec_space); + return; + } + } + } + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tPortableNumericCHASH -- adjusted hashsize:" + << thread_shmem_hash_size + << " thread_shmem_key_size:" << thread_shmem_key_size + << std::endl; + std::cout 
<< "\t\tPortableNumericCHASH -- adjusted team hashsize:" + << team_shmem_hash_size + << " team_shmem_key_size:" << team_shmem_key_size << std::endl; + } + // END OF SHARED MEMORY SIZE CALCULATIONS + + // required memory for L2 + if (KokkosKernels::Impl::kk_is_gpu_exec_space< + typename HandleType::HandleExecSpace>()) { + if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) { + tmp_max_nnz = 1; + } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) { + } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGTEAM || + algorithm_to_run == SPGEMM_KK_MEMORY_TEAM) { + // tmp_max_nnz -= team_shmem_key_size; + } else { + // tmp_max_nnz -= thread_shmem_key_size; + } + } + + // START SIZE CALCULATIONS FOR MEMORYPOOL + if (algorithm_to_run == SPGEMM_KK_LP) { + while (tmp_max_nnz > min_hash_size) { + min_hash_size *= 4; + } + chunksize = min_hash_size; // this is for used hash keys + chunksize += max_nnz; // this is for used hash keys + chunksize += scalarAlignPad; // for padding betwen keys and values + chunksize += min_hash_size * block_bytes / + sizeof(nnz_lno_t); // this is for the hash values + } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) { + while (tmp_max_nnz > min_hash_size) { + min_hash_size *= 2; // try to keep it as low as possible because hashes + // are not tracked. 
+ } + chunksize = min_hash_size; // this is for used hash keys + chunksize += scalarAlignPad; // for padding between keys and values + chunksize += min_hash_size * block_bytes / + sizeof(nnz_lno_t); // this is for the hash values + } else { + while (tmp_max_nnz > min_hash_size) { + min_hash_size *= 4; + } + chunksize = min_hash_size; // this is for used hash indices + chunksize += min_hash_size; // this is for the hash begins + chunksize += max_nnz; // this is for hash nexts + } + + nnz_lno_t num_chunks = + this->template compute_num_pool_chunks( + chunksize * sizeof(nnz_lno_t), + this->concurrency / suggested_vector_size); + + // END SIZE CALCULATIONS FOR MEMORYPOOL + + if (this->KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\t max_nnz: " << max_nnz + << " min_hash_size:" << min_hash_size + << " concurrency:" << this->concurrency + << " MyExecSpace::concurrency():" << MyExecSpace::concurrency() + << " numchunks:" << num_chunks << std::endl; + } + + KokkosKernels::Impl::PoolType my_pool_type = + KokkosKernels::Impl::OneThread2OneChunk; + + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + my_pool_type = KokkosKernels::Impl::ManyThread2OneChunk; + } + + Kokkos::Timer timer1; + pool_memory_space m_space(num_chunks, chunksize, -1, my_pool_type); + MyExecSpace().fence(); + + if (this->KOKKOSKERNELS_VERBOSE) { + m_space.print_memory_pool(); + std::cout << "\t\tPool Alloc Time:" << timer1.seconds() << std::endl; + std::cout << "\t\tPool Size(MB):" + << sizeof(nnz_lno_t) * (num_chunks * chunksize) / 1024. / 1024. 
+ << std::endl; + } + + PortableNumericCHASH< + const_a_lno_row_view_t, const_a_lno_nnz_view_t, const_a_scalar_nnz_view_t, + const_b_lno_row_view_t, const_b_lno_nnz_view_t, const_b_scalar_nnz_view_t, + c_row_view_t, c_lno_nnz_view_t, c_scalar_nnz_view_t, pool_memory_space> + sc(block_dim, this->a_row_cnt, Base::row_mapA, Base::entriesA, + Base::valsA, Base::row_mapB, Base::entriesB, Base::valsB, + + rowmapC_, entriesC_, valuesC_, shmem_size_to_use, + suggested_vector_size, m_space, min_hash_size, max_nnz, + suggested_team_size, + + lcl_my_exec_space, first_level_cut_off, flops_per_row, + this->KOKKOSKERNELS_VERBOSE); + + if (this->KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tvector_size:" << suggested_vector_size + << " suggested_team_size:" << suggested_team_size << std::endl; + } + timer1.reset(); + + if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { + if (algorithm_to_run == SPGEMM_KK_MEMORY_SPREADTEAM) { + if (thread_shmem_key_size <= 0) { + std::cout << "KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: " + "Insufficient shmem available for key for hash map " + "accumulator - Terminating" + << std::endl; + std::cout << " thread_shmem_key_size = " << thread_shmem_key_size + << std::endl; + throw std::runtime_error( + " KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY_SPREADTEAM: " + "Insufficient shmem available for key for hash map accumulator "); + } + int max_team_size = gpu_team_policy4_t(1, 1, suggested_vector_size) + .team_size_max(sc, Kokkos::ParallelForTag()); + int team_size = std::min(suggested_team_size, max_team_size); + sc.set_team_size(team_size); + Kokkos::parallel_for( + "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY_SPREADTEAM", + gpu_team_policy4_t((this->a_row_cnt + team_size - 1) / team_size, + team_size, suggested_vector_size), + sc); + MyExecSpace().fence(); + + } else if (algorithm_to_run == SPGEMM_KK_MEMORY_BIGSPREADTEAM) { + if (thread_shmem_key_size <= 0) { + std::cout << "KokkosBSPGEMM_numeric_hash " + "SPGEMM_KK_MEMORY_BIGSPREADTEAM: 
Insufficient shmem " + "available for key for hash map accumulator - Terminating" + << std::endl; + std::cout << " thread_shmem_key_size = " << thread_shmem_key_size + << std::endl; + throw std::runtime_error( + " KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY_BIGSPREADTEAM: " + "Insufficient shmem available for key for hash map accumulator "); + } + int max_team_size = gpu_team_policy6_t(1, 1, suggested_vector_size) + .team_size_max(sc, Kokkos::ParallelForTag()); + int team_size = std::min(suggested_team_size, max_team_size); + sc.set_team_size(team_size); + Kokkos::parallel_for( + "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY_BIGSPREADTEAM", + gpu_team_policy6_t((this->a_row_cnt + team_size - 1) / team_size, + team_size, suggested_vector_size), + sc); + } else { + if (team_shmem_key_size <= 0) { + std::cout << "KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY: " + "Insufficient shmem " + "available for key for hash map accumulator - Terminating" + << std::endl; + std::cout << " team_shmem_key_size = " << team_shmem_key_size + << std::endl; + throw std::runtime_error( + " KokkosBSPGEMM_numeric_hash SPGEMM_KK_MEMORY: Insufficient shmem " + "available for key for hash map accumulator "); + } + int max_team_size = gpu_team_policy_t(1, 1, suggested_vector_size) + .team_size_max(sc, Kokkos::ParallelForTag()); + int team_size = std::min(suggested_team_size, max_team_size); + sc.set_team_size(team_size); + Kokkos::parallel_for( + "KOKKOSPARSE::SPGEMM::SPGEMM_KK_MEMORY", + gpu_team_policy_t((this->a_row_cnt + team_size - 1) / team_size, + team_size, suggested_vector_size), + sc); + } + MyExecSpace().fence(); + } else { + if (algorithm_to_run == SPGEMM_KK_LP) { + if (Base::use_dynamic_schedule) { + Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_LP::DYNAMIC", + dynamic_multicore_team_policy4_t( + (this->a_row_cnt + suggested_team_size - 1) / + suggested_team_size, + suggested_team_size, suggested_vector_size), + sc); + } else { + 
Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::SPGEMM_KK_LP::STATIC", + multicore_team_policy4_t( + (this->a_row_cnt + suggested_team_size - 1) / + suggested_team_size, + suggested_team_size, suggested_vector_size), + sc); + } + } else { + if (Base::use_dynamic_schedule) { + Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::KKMEM::DYNAMIC", + dynamic_multicore_team_policy_t( + (this->a_row_cnt + suggested_team_size - 1) / + suggested_team_size, + suggested_team_size, suggested_vector_size), + sc); + } else { + Kokkos::parallel_for("KOKKOSPARSE::SPGEMM::KKMEM::STATIC", + multicore_team_policy_t( + (this->a_row_cnt + suggested_team_size - 1) / + suggested_team_size, + suggested_team_size, suggested_vector_size), + sc); + } + } + MyExecSpace().fence(); + } + + if (this->KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tNumeric TIME:" << timer1.seconds() << std::endl; + } +} + +} // namespace Impl +} // namespace KokkosSparse diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp new file mode 100644 index 0000000000..312ba22f8a --- /dev/null +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_seq.hpp @@ -0,0 +1,182 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSSPARSE_BSPGEMM_DEBUG_HPP_ +#define KOKKOSSPARSE_BSPGEMM_DEBUG_HPP_ +#include "KokkosKernels_helpers.hpp" +#include "KokkosBatched_Gemm_Serial_Internal.hpp" +#include + +namespace KokkosSparse { + +namespace Impl { + +// Type obtained by slicing a data_view_t with a (begin, end) pair; this is +// the return type of get_block(). +template +using kk_subview1d = + decltype(Kokkos::subview(data_view_t(), Kokkos::make_pair(0, 0))); + +// Returns the subview of `data` that covers the half-open element range +// [block_index * block_size, (block_index + 1) * block_size). +template +KOKKOS_INLINE_FUNCTION kk_subview1d get_block( + data_view_t data, size_type block_index, lno_t block_size) { + const auto i = block_index * block_size; + return Kokkos::subview(data, Kokkos::make_pair(i, i + block_size)); +} + +// Sequential host implementation of the numeric phase of block SpGEMM +// (C = A * B, where every entry is a block_dim x block_dim block). +// - A, B and the row map of C are copied to host mirrors; the entries and +// values of C are computed on the host and copied back at the end. +// - row_mapC is read, not computed: row i of C is written starting at +// row_mapC(i), so it must already hold the row offsets of C. +// - m is the number of rows traversed, k is the number of blocks in the +// dense per-row accumulator (indexed by the column indices found in B). +// - handle, n, transposeA and transposeB are unused. +template +void bspgemm_debug_numeric(KernelHandle* /* handle */, + typename KernelHandle::nnz_lno_t m, + typename KernelHandle::nnz_lno_t /* n */, + typename KernelHandle::nnz_lno_t k, + typename KernelHandle::nnz_lno_t block_dim, + alno_row_view_t_ row_mapA, alno_nnz_view_t_ entriesA, + ascalar_nnz_view_t_ valuesA, + + bool /* transposeA */, blno_row_view_t_ row_mapB, + blno_nnz_view_t_ entriesB, + bscalar_nnz_view_t_ valuesB, bool /* transposeB */, + clno_row_view_t_ row_mapC, clno_nnz_view_t_ entriesC, + cscalar_nnz_view_t_ valuesC) { + // Host copies of the row map, entries and values of A. + typename alno_row_view_t_::HostMirror h_rma = + Kokkos::create_mirror_view(row_mapA); + Kokkos::deep_copy(h_rma, row_mapA); + typename alno_nnz_view_t_::HostMirror h_enta = + Kokkos::create_mirror_view(entriesA); + Kokkos::deep_copy(h_enta, entriesA); + typename ascalar_nnz_view_t_::HostMirror h_vala = + Kokkos::create_mirror_view(valuesA); + Kokkos::deep_copy(h_vala, valuesA); + + // Host copies of the row map, entries and values of B. + typename blno_row_view_t_::HostMirror h_rmb = + Kokkos::create_mirror_view(row_mapB); + Kokkos::deep_copy(h_rmb, row_mapB); + typename blno_nnz_view_t_::HostMirror h_entb = + Kokkos::create_mirror_view(entriesB); + Kokkos::deep_copy(h_entb, entriesB); + typename bscalar_nnz_view_t_::HostMirror h_valb = + 
Kokkos::create_mirror_view(valuesB); + Kokkos::deep_copy(h_valb, valuesB); + // The row map of C is copied to the host and used to locate each row of C. + typename clno_row_view_t_::HostMirror h_rmc = + Kokkos::create_mirror_view(row_mapC); + Kokkos::deep_copy(h_rmc, row_mapC); + + // Entries and values of C are outputs: their mirrors are created without + // copying the current contents of entriesC / valuesC. + typename clno_nnz_view_t_::HostMirror h_entc = + Kokkos::create_mirror_view(entriesC); + typename cscalar_nnz_view_t_::HostMirror h_valc = + Kokkos::create_mirror_view(valuesC); + Kokkos::fence(); + + typedef typename KernelHandle::nnz_lno_t lno_t; + typedef typename KernelHandle::size_type size_type; + typedef typename KernelHandle::nnz_scalar_t scalar_t; + typedef KokkosBatched::SerialGemmInternal< + KokkosBatched::Algo::Gemm::Unblocked> + GEMM; + + // Number of scalars in one block_dim x block_dim block. + const auto block_size = block_dim * block_dim; + const auto ZERO = static_cast(0); + const auto ONE = static_cast(1); + + // Dense accumulator holding k blocks, plus one flag per column that is set + // while that column is present in the row of C being computed. + typename cscalar_nnz_view_t_::HostMirror accumulator("acc", k * block_size); + Kokkos::deep_copy(accumulator, ZERO); + Kokkos::fence(); + std::vector acc_flag(k, false); + + // Note: h_rmc is never deep-copied back to row_mapC in this function. + h_rmc(0) = 0; + for (lno_t i = 0; i < m; ++i) { + const size_type a_row_begin = h_rma(i); + const size_type a_row_end = h_rma(i + 1); + lno_t a_row_size = a_row_end - a_row_begin; + + // Extent of row i of C according to its row map, and the number of + // entries appended to that row so far. + size_type c_row_begin = h_rmc(i); + lno_t c_row_size = h_rmc(i + 1) - c_row_begin; + lno_t c_row_size_counter = 0; + + // Multiply every block of row i of A with every block of the row of B + // selected by its column index. + for (lno_t j = 0; j < a_row_size; ++j) { + size_type a_ind = a_row_begin + j; + lno_t col = h_enta(a_ind); + auto a_val = &h_vala(a_ind * block_size); + const size_type b_row_begin = h_rmb(col); + const size_type b_row_end = h_rmb(col + 1); + lno_t b_row_size = b_row_end - b_row_begin; + for (lno_t z = 0; z < b_row_size; ++z) { + size_type b_ind = b_row_begin + z; + lno_t b_col = h_entb(b_ind); + auto b_val = &h_valb(b_ind * block_size); + + // First occurrence of b_col in this row: append it to the entries of C. + if (acc_flag[b_col] == false) { + acc_flag[b_col] = true; + h_entc(c_row_begin + c_row_size_counter++) = b_col; + } + // accumulator(b_col) += a_val * b_val + auto acc = get_block(accumulator, b_col, block_size); + GEMM::invoke(block_dim, block_dim, block_dim, ONE, a_val, block_dim, 1, + 
b_val, block_dim, 1, ONE, acc.data(), block_dim, 1); + } + } + + // if (i == 0) std::cout << "result_cols" << std::endl; + + // Flush row i: copy each accumulated block to the values of C, then zero + // the block and clear its flag so both can be reused for the next row. + // The loop bound is c_row_size (from the row map), not c_row_size_counter. + for (lno_t j = 0; j < c_row_size; ++j) { + size_type c_ind = c_row_begin + j; + lno_t result_col = h_entc(c_ind); + auto acc = get_block(accumulator, result_col, block_size); + Kokkos::deep_copy(get_block(h_valc, c_ind, block_size), acc); + Kokkos::deep_copy(acc, ZERO); + Kokkos::fence(); + acc_flag[result_col] = false; + } + } + + // Copy the computed entries and values of C back to the caller's views. + Kokkos::deep_copy(entriesC, h_entc); + Kokkos::deep_copy(valuesC, h_valc); + Kokkos::fence(); +} + +} // namespace Impl +} // namespace KokkosSparse +#endif diff --git a/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp new file mode 100644 index 0000000000..372e5d10dd --- /dev/null +++ b/src/sparse/impl/KokkosSparse_bspgemm_impl_speed.hpp @@ -0,0 +1,657 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include "KokkosKernels_Utils.hpp" + +namespace KokkosSparse { + +namespace Impl { + +// CPU functor of the "speed" numeric phase of block SpGEMM (invoked with +// MultiCoreTag). Each thread takes one chunk from `memory_space` and uses it +// as a dense accumulator: numcols blocks of block_size scalars followed by +// one marker byte per column. +template +template +struct KokkosBSPGEMM::NumericCMEM_CPU { + using BSPGEMM = KokkosBSPGEMM; + + nnz_lno_t numrows; + nnz_lno_t numcols; + nnz_lno_t block_dim; + nnz_lno_t block_size; + + a_row_view_t row_mapA; + a_nnz_view_t entriesA; + a_scalar_view_t valuesA; + + b_row_view_t row_mapB; + b_nnz_view_t entriesB; + b_scalar_view_t valuesB; + + c_row_view_t rowmapC; + c_nnz_view_t entriesC; + c_scalar_view_t valuesC; + mpool_type memory_space; + + // Raw pointers to the data of entriesC and valuesC. + nnz_lno_t *pEntriesC; + scalar_t *pVals; + const KokkosKernels::Impl::ExecSpaceType my_exec_space; + const nnz_lno_t team_work_size; + + NumericCMEM_CPU(nnz_lno_t m_, nnz_lno_t k_, nnz_lno_t block_dim_, + a_row_view_t row_mapA_, a_nnz_view_t entriesA_, + a_scalar_view_t valuesA_, + + b_row_view_t row_mapB_, b_nnz_view_t entriesB_, + b_scalar_view_t valuesB_, + + c_row_view_t rowmapC_, c_nnz_view_t entriesC_, + c_scalar_view_t valuesC_, mpool_type memory_space_, + const KokkosKernels::Impl::ExecSpaceType my_exec_space_, + nnz_lno_t 
team_row_chunk_size) + : numrows(m_), + numcols(k_), + block_dim(block_dim_), + block_size(block_dim_ * block_dim_), + row_mapA(row_mapA_), + entriesA(entriesA_), + valuesA(valuesA_), + + row_mapB(row_mapB_), + entriesB(entriesB_), + valuesB(valuesB_), + + rowmapC(rowmapC_), + entriesC(entriesC_), + valuesC(valuesC_), + memory_space(memory_space_), + pEntriesC(entriesC_.data()), + pVals(valuesC.data()), + my_exec_space(my_exec_space_), + team_work_size(team_row_chunk_size) {} + + // Index handed to memory_space.allocate_chunk(): 0 for Serial, the hardware + // thread id for OpenMP / Threads (when those backends are enabled), and + // row_index in every other case. + KOKKOS_INLINE_FUNCTION + size_t get_thread_id(const size_t row_index) const { + switch (my_exec_space) { + default: return row_index; +#if defined(KOKKOS_ENABLE_SERIAL) + case KokkosKernels::Impl::Exec_SERIAL: return 0; +#endif +#if defined(KOKKOS_ENABLE_OPENMP) + case KokkosKernels::Impl::Exec_OMP: + return Kokkos::OpenMP::impl_hardware_thread_id(); +#endif +#if defined(KOKKOS_ENABLE_THREADS) + case KokkosKernels::Impl::Exec_THREADS: + return Kokkos::Threads::impl_hardware_thread_id(); +#endif +#if defined(KOKKOS_ENABLE_CUDA) + case KokkosKernels::Impl::Exec_CUDA: return row_index; +#endif +#if defined(KOKKOS_ENABLE_HIP) + case KokkosKernels::Impl::Exec_HIP: return row_index; +#endif + } + } + + // Team entry point: the team handles rows [team_row_begin, team_row_end), + // i.e. a chunk of team_work_size rows clipped to numrows. + KOKKOS_INLINE_FUNCTION + void operator()(const MultiCoreTag &, const team_member_t &teamMember) const { + nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size; + const nnz_lno_t team_row_end = + KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_work_size, numrows); + + // Retry until the pool hands out a chunk. + scalar_t *dense_accum = NULL; + size_t tid = get_thread_id(team_row_begin + teamMember.team_rank()); + while (dense_accum == NULL) { + dense_accum = (scalar_t *)(memory_space.allocate_chunk(tid)); + } + // The marker bytes start right after the numcols * block_size scalars of + // the accumulator. + char *marker = (char *)(dense_accum + numcols * block_size); + + // Performs C[row_index,b_col_ind] += A[row_index,rowB] * B[rowB,b_col_ind] + // using dense_accum[col] to accumulate scalar values, + // marker[col] for boolean flags denoting initialized accumulators + // and col=pEntriesC[i] to index sparse column 
indices. + // Note: each CPU thread works on its own row, thus no need for locking. + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), + [&](const nnz_lno_t &row_index) { + const size_type c_row_begin = rowmapC[row_index]; + nnz_lno_t *myentries = pEntriesC + c_row_begin; + scalar_t *myvals = pVals + c_row_begin * block_size; + + nnz_lno_t current_col_index = 0; + const size_type col_begin = row_mapA[row_index]; + const nnz_lno_t row_size = row_mapA[row_index + 1] - col_begin; + + for (nnz_lno_t colind = 0; colind < row_size; ++colind) { + size_type a_col = colind + col_begin; + nnz_lno_t rowB = entriesA[a_col]; + const scalar_t *a_val = &valuesA[a_col * block_size]; + + size_type rowBegin = row_mapB(rowB); + nnz_lno_t left_work = row_mapB(rowB + 1) - rowBegin; + for (int i = 0; i < left_work; ++i) { + const size_type adjind = i + rowBegin; + nnz_lno_t b_col_ind = entriesB[adjind]; + const scalar_t *b_val = &valuesB[adjind * block_size]; + // First occurrence of this column in the row: record it in C. + if (marker[b_col_ind] == 0) { + marker[b_col_ind] = 1; + myentries[current_col_index++] = b_col_ind; + } + kk_block_add_mul(block_dim, dense_accum + b_col_ind * block_size, + a_val, b_val); + } + } + // Flush the row: for every recorded column, kk_block_set() is given the + // destination block in C and the accumulator block, kk_block_init() + // resets the accumulator block (presumably to zero; helper defined + // elsewhere) and the marker is cleared for the next row. + for (nnz_lno_t i = 0; i < current_col_index; ++i) { + nnz_lno_t ind = myentries[i]; + scalar_t *acc = dense_accum + ind * block_size; + kk_block_set(block_dim, myvals + i * block_size, acc); + kk_block_init(block_dim, acc); + marker[ind] = 0; + } + }); + memory_space.release_chunk(dense_accum); + } +}; + +// GPU functor of the "speed" numeric phase of block SpGEMM (invoked with +// GPUTag). Every thread of a team accumulates its rows in a hashmap carved +// out of the team's shared memory (hm); insertions that fail there go to a +// second hashmap (hm2) stored in the row's slice of entriesC / valuesC and +// of beginsC / nextsC. +template +template + +struct KokkosBSPGEMM::NumericCMEM { + static constexpr auto scalarAlignPad = + KokkosBSPGEMM::scalarAlignPad; + + nnz_lno_t numrows; + nnz_lno_t block_dim; + nnz_lno_t block_size; + + a_row_view_t__ row_mapA; + a_nnz_view_t__ entriesA; + a_scalar_view_t__ valuesA; + + b_row_view_t__ row_mapB; + b_nnz_view_t__ entriesB; + b_scalar_view_t__ valuesB; + + c_row_view_t__ rowmapC; + c_nnz_view_t__ entriesC; + c_scalar_view_t__ valuesC; + + c_nnz_tmp_view_t beginsC; + 
c_nnz_tmp_view_t nextsC; + + nnz_lno_t *pbeginsC, *pnextsC, *pEntriesC; + scalar_t *pvaluesC; + + const size_t shared_memory_size; + const int vector_size; + const nnz_lno_t team_work_size; + + const int unit_memory; // bytes per hashmap entry: 3 x nnz_lno_t plus one block of scalars + const int suggested_team_size; + const int thread_memory; + nnz_lno_t shmem_key_size; + nnz_lno_t shared_memory_hash_func; + nnz_lno_t shmem_hash_size; + + // Stores the inputs and derives, from the shared-memory budget, the + // per-thread memory (thread_memory), the hashmap capacity (shmem_key_size) + // and the number of hash buckets (shmem_hash_size). + NumericCMEM(nnz_lno_t m_, nnz_lno_t block_dim_, a_row_view_t__ row_mapA_, + a_nnz_view_t__ entriesA_, a_scalar_view_t__ valuesA_, + + b_row_view_t__ row_mapB_, b_nnz_view_t__ entriesB_, + b_scalar_view_t__ valuesB_, + + c_row_view_t__ rowmapC_, c_nnz_view_t__ entriesC_, + c_scalar_view_t__ valuesC_, + + c_nnz_tmp_view_t beginsC_, c_nnz_tmp_view_t nextsC_, + + const size_type sharedMemorySize_, + const int suggested_vector_size, + const nnz_lno_t team_row_chunk_size, int suggested_team_size_, + bool KOKKOSKERNELS_VERBOSE_) + : numrows(m_), + block_dim(block_dim_), + block_size(block_dim_ * block_dim_), + + row_mapA(row_mapA_), + entriesA(entriesA_), + valuesA(valuesA_), + + row_mapB(row_mapB_), + entriesB(entriesB_), + valuesB(valuesB_), + + rowmapC(rowmapC_), + entriesC(entriesC_), + valuesC(valuesC_), + beginsC(beginsC_), + nextsC(nextsC_), + pbeginsC(beginsC_.data()), + pnextsC(nextsC_.data()), + pEntriesC(entriesC_.data()), + pvaluesC(valuesC_.data()), + shared_memory_size(sharedMemorySize_), + + vector_size(suggested_vector_size), + team_work_size(team_row_chunk_size), + + unit_memory(sizeof(nnz_lno_t) * 2 + sizeof(nnz_lno_t) + + sizeof(scalar_t) * block_size), + suggested_team_size(suggested_team_size_), + // Per-thread share of the shared memory, rounded down to a multiple of 8. + thread_memory((shared_memory_size / 8 / suggested_team_size_) * 8), + shmem_key_size(), + shared_memory_hash_func(), + shmem_hash_size(1) { + shmem_key_size = ((thread_memory - sizeof(nnz_lno_t) * 2 - scalarAlignPad) / + unit_memory); + if (KOKKOSKERNELS_VERBOSE_) { + std::cout << "\t\tNumericCMEM -- thread_memory:" << thread_memory + << " 
unit_memory:" << unit_memory + << " initial key size:" << shmem_key_size << std::endl; + } + // Double the bucket count (starting from 1) while it still fits within + // shmem_key_size, so shmem_hash_size stays a power of two. + while (shmem_hash_size * 2 <= shmem_key_size) { + shmem_hash_size = shmem_hash_size * 2; + } + // Power-of-two bucket count minus one: the right-hand side handed to the + // bitwiseAnd hashmap in operator(). + shared_memory_hash_func = shmem_hash_size - 1; + shmem_key_size = shmem_key_size + + ((shmem_key_size - shmem_hash_size) * sizeof(nnz_lno_t)) / + (unit_memory - sizeof(nnz_lno_t)); + // Round the key count down to an even number. + shmem_key_size = (shmem_key_size >> 1) << 1; + + if (KOKKOSKERNELS_VERBOSE_) { + std::cout << "\t\tNumericCMEM -- adjusted hashsize:" << shmem_hash_size + << " shmem_key_size:" << shmem_key_size << std::endl; + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const GPUTag &, const team_member_t &teamMember) const { + // get the beginning and end rows of the team. + nnz_lno_t team_row_begin = teamMember.league_rank() * team_work_size; + const nnz_lno_t team_row_end = + KOKKOSKERNELS_MACRO_MIN(team_row_begin + team_work_size, numrows); + + char *all_shared_memory = + (char *)(teamMember.team_shmem().get_shmem(shared_memory_size)); + + // shift it to the thread private part + all_shared_memory += thread_memory * teamMember.team_rank(); + + // used_hash_sizes hold the size of 1st and 2nd level hashes + volatile nnz_lno_t *used_hash_sizes = + (volatile nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * 2; + + // hash bucket array of the first-level hashmap (shmem_hash_size entries) + nnz_lno_t *begins = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * shmem_hash_size; + + // points to the next elements + nnz_lno_t *nexts = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size; + + // holds the keys + nnz_lno_t *keys = (nnz_lno_t *)(all_shared_memory); + all_shared_memory += sizeof(nnz_lno_t) * shmem_key_size; + // the value blocks follow the keys, at an aligned address + scalar_t *vals = + KokkosKernels::Impl::alignPtr(all_shared_memory); + + KokkosKernels::Experimental::BlockHashmapAccumulator< + nnz_lno_t, nnz_lno_t, scalar_t, + KokkosKernels::Experimental::HashOpType::bitwiseAnd> + hm(block_dim, shmem_key_size, shared_memory_hash_func, begins, 
nexts, + keys, vals); + + // issue-508, TODO: understand and re-work below parallel_for loop. + // Initialize hm2 with correct max_value_size and hashOpRHS + // global_memory_hash_size is computed, per team of threads -- this is + // hashOpRHS. + + KokkosKernels::Experimental::BlockHashmapAccumulator< + nnz_lno_t, nnz_lno_t, scalar_t, + KokkosKernels::Experimental::HashOpType::modulo> + hm2(block_dim, 0, 0, NULL, NULL, NULL, NULL); + /* + KokkosKernels::Experimental::HashmapAccumulator + hm2(global_memory_hash_size, global_memory_hash_size, + pbeginsC + c_row_begin, pnextsC + c_row_begin, pEntriesC + c_row_begin, + pvaluesC + c_row_begin); + */ + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(teamMember, team_row_begin, team_row_end), + [&](const nnz_lno_t &row_index) { + const size_type c_row_begin = rowmapC[row_index]; + const nnz_lno_t global_memory_hash_size = + nnz_lno_t(rowmapC[row_index + 1] - c_row_begin); + + // Point the second-level hashmap at row row_index of C: its keys and + // values are the row's slice of entriesC / valuesC, its bucket and + // link arrays are the matching slices of beginsC / nextsC. + hm2.keys = pEntriesC + c_row_begin; + hm2.values = pvaluesC + c_row_begin * block_size; + hm2.hash_begins = pbeginsC + c_row_begin; + hm2.hash_nexts = pnextsC + c_row_begin; + + // initialize begins. 
+ Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, shmem_hash_size), + [&](int i) { begins[i] = -1; }); + + // initialize hash usage sizes + Kokkos::single(Kokkos::PerThread(teamMember), [&]() { + used_hash_sizes[0] = 0; + used_hash_sizes[1] = 0; + }); + + const size_type col_begin = row_mapA[row_index]; + const nnz_lno_t left_work = + nnz_lno_t(row_mapA[row_index + 1] - col_begin); + + for (nnz_lno_t colind = 0; colind < left_work; ++colind) { + size_type a_col = colind + col_begin; + nnz_lno_t rowB = entriesA[a_col]; + const scalar_t *a_val = &valuesA[a_col * block_size]; + + size_type rowBegin = row_mapB(rowB); + nnz_lno_t left_work_ = row_mapB(rowB + 1) - rowBegin; + + // Walk row rowB of B in batches of at most vector_size entries. + while (left_work_) { + nnz_lno_t work_to_handle = + KOKKOSKERNELS_MACRO_MIN(vector_size, left_work_); + nnz_lno_t b_col_ind = -1; + const scalar_t *b_val = nullptr; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, work_to_handle), + [&](nnz_lno_t i) { + const size_type adjind = i + rowBegin; + b_col_ind = entriesB[adjind]; + b_val = &valuesB[adjind * block_size]; + }); + + int num_unsuccess = hm.vector_atomic_insert_into_hash_mergeAdd( + b_col_ind, a_val, b_val, used_hash_sizes); + + int overall_num_unsuccess = 0; + + // Sum num_unsuccess over the vector range to learn whether any + // first-level insertion failed. + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(teamMember, vector_size), + [&](const int /* threadid */, int &overall_num_unsuccess_) { + overall_num_unsuccess_ += num_unsuccess; + }, + overall_num_unsuccess); + + if (overall_num_unsuccess) { + // hash_ stays -1 where the first-level insertion succeeded. + nnz_lno_t hash_ = -1; + if (num_unsuccess) { + hash_ = b_col_ind % global_memory_hash_size; + } + + // int insertion = + hm2.vector_atomic_insert_into_hash_mergeAdd_with_team_level_list_length( + teamMember, vector_size, hash_, b_col_ind, a_val, b_val, + used_hash_sizes + 1, global_memory_hash_size); + } + left_work_ -= work_to_handle; + rowBegin += work_to_handle; + } + } + + // Clamp the first-level count to the capacity of the hashmap. + Kokkos::single(Kokkos::PerThread(teamMember), [&]() { + if (used_hash_sizes[0] > shmem_key_size) + used_hash_sizes[0] = 
shmem_key_size; + }); + + size_type num_elements = used_hash_sizes[0]; + + // Append the content of the first-level hashmap to row row_index of C, + // after the used_hash_sizes[1] entries counted for the second level. + size_type written_index = used_hash_sizes[1]; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(teamMember, num_elements), + [&](size_type i) { + const auto idx = c_row_begin + written_index + i; + pEntriesC[idx] = keys[i]; + kk_block_set(block_dim, pvaluesC + idx * block_size, + vals + i * block_size); + }); + }); + } + + // Shared memory requested per team; the same for every team size. + size_t team_shmem_size(int /* team_size */) const { + return shared_memory_size; + } +}; + +// +// * Notes on KokkosBSPGEMM_numeric_speed * +// +// Prior to this routine, KokkosBSPGEMM_numeric(...) was called +// +// KokkosBSPGEMM_numeric(...) : +// if (this->spgemm_algorithm == SPGEMM_KK || SPGEMM_KK_LP == +// this->spgemm_algorithm) : +// call KokkosBSPGEMM_numeric_speed(...) +// else: +// call KokkosBSPGEMM_numeric_hash(...) +// +// +// KokkosBSPGEMM_numeric_speed: +// +// Algorithm selection as follows and matching to kernel Tag: +// +// Policy typedefs with tags found in: KokkosSparse_spgemm_impl.hpp +// +// if GPU: +// "KokkosSparse::NumericCMEM::KKSPEED::GPU" : gpu_team_policy_t, i.e. +// GPUTag +// +// else : +// "KokkosSparse::NumericCMEM_CPU::DENSE::DYNAMIC" : +// dynamic_multicore_team_policy_t, i.e. MultiCoreTag +// "KokkosSparse::NumericCMEM_CPU::DENSE::STATIC" : multicore_team_policy_t, +// i.e. MultiCoreTag +// + +// Runs the numeric phase with the NumericCMEM functor on GPU execution +// spaces and with NumericCMEM_CPU (dense accumulators taken from a uniform +// memory pool) otherwise. rowmapC_, entriesC_ and valuesC_ are handed to the +// functor unchanged; my_exec_space_ is only used on the CPU path. +template +template +void KokkosBSPGEMM:: + KokkosBSPGEMM_numeric_speed( + c_row_view_t rowmapC_, c_lno_nnz_view_t entriesC_, + c_scalar_nnz_view_t valuesC_, + KokkosKernels::Impl::ExecSpaceType my_exec_space_) { + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\tSPEED MODE" << std::endl; + } + + nnz_lno_t brows = this->row_mapB.extent(0) - 1; + size_type bnnz = this->valsB.extent(0); + + // get suggested vector size, teamsize and row chunk size. 
+ int suggested_vector_size = + this->handle->get_suggested_vector_size(brows, bnnz); + int suggested_team_size = + this->handle->get_suggested_team_size(suggested_vector_size); + nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size( + suggested_team_size, this->concurrency, Base::a_row_cnt); + + Kokkos::Timer numeric_speed_timer_with_free; + + if (KokkosKernels::Impl::kk_is_gpu_exec_space< + typename HandleType::HandleExecSpace>()) { + // allocate memory for begins and next to be used by the hashmap + // NOTE(review): both views are sized with valuesC_.extent(0) although the + // functor indexes them per entry of C (pbeginsC + c_row_begin); this looks + // larger than needed when valuesC_ holds block_size scalars per entry - + // confirm. + nnz_lno_temp_work_view_t beginsC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C keys"), + valuesC_.extent(0)); + nnz_lno_temp_work_view_t nextsC( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "C nexts"), + valuesC_.extent(0)); + Kokkos::deep_copy(beginsC, -1); + + // create the functor. + NumericCMEM + sc(Base::a_row_cnt, block_dim, this->row_mapA, this->entriesA, + this->valsA, this->row_mapB, this->entriesB, this->valsB, + + rowmapC_, entriesC_, valuesC_, + + beginsC, nextsC, this->shmem_size, suggested_vector_size, + team_row_chunk_size, suggested_team_size, + Base::KOKKOSKERNELS_VERBOSE); + + Kokkos::Timer timer1; + MyExecSpace().fence(); + + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tGPU vector_size:" << suggested_vector_size + << " team_size:" << suggested_team_size + << " chunk_size:" << team_row_chunk_size << std::endl; + } + + timer1.reset(); + // this is basically kkmem without memory pools. + // only executed to check the effect of memory pools. 
+ Kokkos::parallel_for( + "KokkosSparse::NumericCMEM::KKSPEED::GPU", + gpu_team_policy_t(Base::a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + MyExecSpace().fence(); + + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tNumeric TIME:" << timer1.seconds() << std::endl; + } + } else { + Kokkos::Timer numeric_speed_timer; + typedef KokkosKernels::Impl::UniformMemoryPool + pool_memory_space; + + KokkosKernels::Impl::PoolType my_pool_type = + KokkosKernels::Impl::OneThread2OneChunk; + int num_chunks = this->concurrency; + + Kokkos::Timer timer1; + // Chunk size counted in scalars: b_col_cnt blocks for the dense + // accumulator plus room for one marker byte per column of B. + const size_t chunk_size = this->b_col_cnt * block_dim * block_dim + + this->b_col_cnt / sizeof(scalar_t) + 1; + pool_memory_space m_space(num_chunks, chunk_size, 0, my_pool_type); + MyExecSpace().fence(); + + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tPool Alloc Time:" << timer1.seconds() << std::endl; + std::cout << "\tPool Size(MB):" + << sizeof(scalar_t) * (num_chunks * chunk_size) / 1024. / 1024. 
+ << std::endl; + } + + NumericCMEM_CPU + sc(Base::a_row_cnt, this->b_col_cnt, block_dim, this->row_mapA, + this->entriesA, this->valsA, this->row_mapB, this->entriesB, + this->valsB, + + rowmapC_, entriesC_, valuesC_, m_space, my_exec_space_, + team_row_chunk_size); + + MyExecSpace().fence(); + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tCPU vector_size:" << suggested_vector_size + << " team_size:" << suggested_team_size + << " chunk_size:" << team_row_chunk_size << std::endl; + } + timer1.reset(); + + // Same functor and league size for both schedules; only the policy type + // differs. + if (this->use_dynamic_schedule) { + Kokkos::parallel_for("KokkosSparse::NumericCMEM_CPU::DENSE::DYNAMIC", + dynamic_multicore_team_policy_t( + Base::a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + } else { + Kokkos::parallel_for( + "KokkosSparse::NumericCMEM_CPU::DENSE::STATIC", + multicore_team_policy_t(Base::a_row_cnt / team_row_chunk_size + 1, + suggested_team_size, suggested_vector_size), + sc); + } + + MyExecSpace().fence(); + + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tNumeric TIME:" << timer1.seconds() << std::endl; + std::cout << "\t\tNumeric SPEED TIME:" << numeric_speed_timer.seconds() + << std::endl; + } + } + if (Base::KOKKOSKERNELS_VERBOSE) { + std::cout << "\t\tNumeric SPEED TIME WITH FREE:" + << numeric_speed_timer_with_free.seconds() << std::endl; + } +} +} // namespace Impl +} // namespace KokkosSparse diff --git a/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp new file mode 100644 index 0000000000..d87c49bd55 --- /dev/null +++ b/src/sparse/impl/KokkosSparse_bspgemm_numeric_spec.hpp @@ -0,0 +1,407 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSSPARSE_IMPL_BSPGEMM_NUMERIC_SPEC_HPP_ +#define KOKKOSSPARSE_IMPL_BSPGEMM_NUMERIC_SPEC_HPP_ + +#include + +#include +//#include +#include "KokkosKernels_Handle.hpp" +// Include the actual functors +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +//#include "KokkosSparse_spgemm_symbolic.hpp" +#include "KokkosSparse_spgemm_cuSPARSE_impl.hpp" +#include "KokkosSparse_spgemm_CUSP_impl.hpp" +#include "KokkosSparse_bspgemm_impl.hpp" +#include "KokkosSparse_bspgemm_impl_seq.hpp" +#include "KokkosSparse_spgemm_mkl_impl.hpp" +#include "KokkosSparse_spgemm_mkl2phase_impl.hpp" +#include "KokkosSparse_spgemm_viennaCL_impl.hpp" +#endif + +namespace KokkosSparse { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct bspgemm_numeric_eti_spec_avail { + enum : bool { value = false }; +}; + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ + template <> \ + struct bspgemm_numeric_eti_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; \ + \ + template <> \ + struct 
bspgemm_numeric_eti_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +//#include +#include + +namespace KokkosSparse { +namespace Impl { + +// For future use (when TPL with block SpGEMM numeric phase is encountered) +template +struct bspgemm_numeric_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Unification layer +/// \brief Implementation of BSR sparse block matrix - matrix multiplication + +template ::value, + bool eti_spec_avail = bspgemm_numeric_eti_spec_avail< + KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t, + b_size_view_t_, b_lno_view_t, b_scalar_view_t, c_size_view_t_, + c_lno_view_t, c_scalar_view_t>::value> +struct BSPGEMM_NUMERIC { + static void bspgemm_numeric(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, + typename KernelHandle::const_nnz_lno_t n, + typename KernelHandle::const_nnz_lno_t k, + typename KernelHandle::const_nnz_lno_t blockDim, + a_size_view_t_ row_mapA, a_lno_view_t entriesA, + a_scalar_view_t valuesA, + + bool transposeA, b_size_view_t_ row_mapB, + b_lno_view_t entriesB, b_scalar_view_t valuesB, + bool transposeB, c_size_view_t_ row_mapC, + c_lno_view_t &entriesC, c_scalar_view_t &valuesC); +}; + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + +//! 
Full specialization of block spgemm +// Unification layer +template +struct BSPGEMM_NUMERIC< + KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t, b_size_view_t_, + b_lno_view_t, b_scalar_view_t, c_size_view_t_, c_lno_view_t, + c_scalar_view_t, false, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> { + static void bspgemm_numeric( + KernelHandle *handle, typename KernelHandle::nnz_lno_t m, + typename KernelHandle::nnz_lno_t n, typename KernelHandle::nnz_lno_t k, + typename KernelHandle::const_nnz_lno_t blockDim, a_size_view_t_ row_mapA, + a_lno_view_t entriesA, a_scalar_view_t valuesA, + + bool transposeA, b_size_view_t_ row_mapB, b_lno_view_t entriesB, + b_scalar_view_t valuesB, bool transposeB, c_size_view_t_ row_mapC, + c_lno_view_t &entriesC, c_scalar_view_t &valuesC) { + typedef typename KernelHandle::SPGEMMHandleType spgemmHandleType; + spgemmHandleType *sh = handle->get_spgemm_handle(); + if (!sh->is_symbolic_called()) { + throw std::runtime_error( + "Call spgemm symbolic before calling SpGEMM numeric"); + } + + switch (sh->get_algorithm_type()) { + case SPGEMM_CUSPARSE: + throw std::runtime_error( + "cuSPARSE implementation for block SpGEMM is not available"); + case SPGEMM_CUSP: + throw std::runtime_error( + "CUSP implementation for block SpGEMM is not available"); + case SPGEMM_MKL: + case SPGEMM_MKL2PHASE: + throw std::runtime_error( + "MKL implementation available for block SpGEMM is not available"); + case SPGEMM_VIENNA: + throw std::runtime_error( + "Vienna implementation available for block SpGEMM is not " + "available"); + + default: + + { + KokkosBSPGEMM + kbspgemm(handle, m, n, k, blockDim, row_mapA, entriesA, valuesA, + transposeA, row_mapB, entriesB, valuesB, transposeB); + kbspgemm.KokkosBSPGEMM_numeric(row_mapC, entriesC, valuesC); + } break; + case SPGEMM_SERIAL: + case SPGEMM_DEBUG: + bspgemm_debug_numeric(handle, m, n, k, blockDim, row_mapA, entriesA, + valuesA, transposeA, row_mapB, entriesB, valuesB, + transposeB, row_mapC, entriesC, 
valuesC); + break; + } + } +}; + +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ + extern template struct BSPGEMM_NUMERIC< \ + typename KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; \ + \ + extern template struct BSPGEMM_NUMERIC< \ + typename KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSSPARSE_BSPGEMM_NUMERIC_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE) \ + template struct BSPGEMM_NUMERIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + 
Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; \ + \ + template struct BSPGEMM_NUMERIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, FAST_MEM_SPACE_TYPE, SLOW_MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +//#include +#include + +#endif // KOKKOS_BLAS1_MV_IMPL_DOT_HPP_ diff --git a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp index 60a00bd36a..bb95eea101 100644 --- a/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_cluster_gauss_seidel_impl.hpp @@ -337,9 +337,13 @@ class ClusterGaussSeidel { (teamMember.league_rank() * _clusters_per_team) + work; if (ii >= _color_set_end) return; - nnz_lno_t cluster = _color_adj(ii); - for (nnz_lno_t j = _cluster_offsets(cluster); - j < _cluster_offsets(cluster + 1); j++) { + nnz_lno_t cluster = _color_adj(ii); + nnz_lno_t clusterBegin = _cluster_offsets(cluster); + nnz_lno_t clusterEnd = _cluster_offsets(cluster + 1); + for (nnz_lno_t jcount = 0; jcount < clusterEnd - clusterBegin; + jcount++) { + nnz_lno_t j = _is_backward ? 
(clusterEnd - 1 - jcount) + : clusterBegin + jcount; nnz_lno_t row = _cluster_verts(j); nnz_lno_t num_vecs = _Xvector.extent(1); for (nnz_lno_t batch_start = 0; batch_start < num_vecs;) { @@ -352,14 +356,10 @@ class ClusterGaussSeidel { COL_BATCH_CASE(1) COL_BATCH_CASE(2) COL_BATCH_CASE(3) - COL_BATCH_CASE(4) - COL_BATCH_CASE(5) - COL_BATCH_CASE(6) - COL_BATCH_CASE(7) #undef COL_BATCH_CASE default: - runColBatch<8>(teamMember, row, batch_start); - batch_start += 8; + runColBatch<4>(teamMember, row, batch_start); + batch_start += 4; } } } @@ -561,6 +561,7 @@ class ClusterGaussSeidel { in_rowmap_t, in_colinds_t, rowmap_t, colinds_t, MyExecSpace>( num_rows, this->row_map, this->entries, sym_xadj, sym_adj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "SYMMETRIZING TIME: " << timer.seconds() << std::endl; timer.reset(); #endif @@ -607,6 +608,7 @@ class ClusterGaussSeidel { " is not implemented"); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "Graph clustering: " << timer.seconds() << '\n'; timer.reset(); #endif @@ -620,6 +622,7 @@ class ClusterGaussSeidel { raw_sym_xadj, raw_sym_adj, vertClusters, numClusters, clusterRowmap, clusterEntries, clusterOffsets, clusterVerts, false); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "Building explicit cluster graph: " << timer.seconds() << '\n'; timer.reset(); #endif @@ -668,6 +671,7 @@ class ClusterGaussSeidel { kh.destroy_graph_coloring_handle(); #endif #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "Coloring: " << timer.seconds() << '\n'; timer.reset(); #endif @@ -677,8 +681,8 @@ class ClusterGaussSeidel { typename HandleType::GraphColoringHandleType::color_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( numClusters, numColors, colors, color_xadj, color_adj); - MyExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "CREATE_REVERSE_MAP:" << timer.seconds() 
<< std::endl; timer.reset(); #endif @@ -798,8 +802,8 @@ class ClusterGaussSeidel { } gsHandle->set_inverse_diagonal(inverse_diagonal); gsHandle->set_call_numeric(true); - MyExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE + MyExecSpace().fence(); std::cout << "NUMERIC:" << timer.seconds() << std::endl; #endif } @@ -861,7 +865,6 @@ class ClusterGaussSeidel { this->IterativePSGS(gs, numColors, h_color_xadj, numIter, apply_forward, apply_backward); } - MyExecSpace().fence(); } template @@ -894,7 +897,6 @@ class ClusterGaussSeidel { gs._clusters_per_team, team_size, vec_size), gs); - MyExecSpace().fence(); } } if (apply_backward) { @@ -913,7 +915,6 @@ class ClusterGaussSeidel { gs._clusters_per_team, team_size, vec_size), gs); - MyExecSpace().fence(); if (i == 0) { break; } @@ -945,7 +946,6 @@ class ClusterGaussSeidel { Kokkos::RangePolicy( 0, color_index_end - color_index_begin), gs); - MyExecSpace().fence(); } } if (apply_backward && numColors) { @@ -958,7 +958,6 @@ class ClusterGaussSeidel { Kokkos::RangePolicy( 0, color_index_end - color_index_begin), gs); - MyExecSpace().fence(); if (i == 0) { break; } diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 0f265dfbc4..137b75b3f7 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -46,13 +46,14 @@ #define _KOKKOSGSIMP_HPP #include "KokkosKernels_Utils.hpp" +#include "KokkosSparse_Utils.hpp" #include #include #include "KokkosGraph_Distance1Color.hpp" #include "KokkosKernels_Uniform_Initialized_MemoryPool.hpp" #include "KokkosKernels_BitUtils.hpp" #include "KokkosKernels_SimpleUtils.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_SortCrs.hpp" // FOR DEBUGGING #include "KokkosBlas1_nrm2.hpp" @@ -62,7 +63,7 @@ namespace Impl { template + KokkosSparse::SparseMatrixFormat format = KokkosSparse::CRS> class PointGaussSeidel { public: typedef lno_row_view_t_ 
in_lno_row_view_t; @@ -136,7 +137,7 @@ class PointGaussSeidel { pool_memory_space; typedef - typename KokkosKernels::Impl::MatrixRowIndex + typename KokkosSparse::Impl::MatrixRowIndex RowIndex; private: @@ -979,8 +980,8 @@ class PointGaussSeidel { gsHandle->set_long_row_x(long_row_x); } else { // Just sort rows by ID. - KokkosKernels::sort_crs_graph(color_xadj, color_adj); + KokkosSparse::sort_crs_graph(color_xadj, color_adj); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE MyExecSpace().fence(); @@ -1104,7 +1105,7 @@ class PointGaussSeidel { // std::cout << "level_2_mem:" << level_2_mem << std::endl; size_type num_large_rows = 0; - KokkosKernels::Impl::kk_reduce_numrows_larger_than_threshold< + KokkosSparse::Impl::kk_reduce_numrows_larger_than_threshold< row_lno_persistent_work_view_t, MyExecSpace>( brows, permuted_xadj, num_values_in_l1, num_large_rows); num_big_rows = KOKKOSKERNELS_MACRO_MIN( diff --git a/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index 182d33a2e7..5af78f96c5 100644 --- a/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/src/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -161,7 +161,7 @@ struct GAUSS_SEIDEL_SYMBOLIC { }; template < - class KernelHandle, KokkosKernels::SparseMatrixFormat format, + class KernelHandle, KokkosSparse::SparseMatrixFormat format, class a_size_view_t_, class a_lno_view_t, class a_scalar_view_t, bool tpl_spec_avail = gauss_seidel_numeric_tpl_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t>::value, @@ -180,7 +180,7 @@ struct GAUSS_SEIDEL_NUMERIC { a_scalar_view_t given_inverse_diagonal, bool is_graph_symmetric); }; -template struct GAUSS_SEIDEL_NUMERIC struct GAUSS_SEIDEL_APPLY, \ - KokkosKernels::BlockCRS, \ + KokkosSparse::BlockCRS, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -416,7 +416,7 @@ struct GAUSS_SEIDEL_APPLY, \ - KokkosKernels::BSR, \ + KokkosSparse::BSR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -435,7 +435,7 @@ 
struct GAUSS_SEIDEL_APPLY, \ - KokkosKernels::BlockCRS, \ + KokkosSparse::BlockCRS, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -456,7 +456,7 @@ struct GAUSS_SEIDEL_APPLY, \ - KokkosKernels::BSR, \ + KokkosSparse::BSR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -481,7 +481,7 @@ struct GAUSS_SEIDEL_APPLY, \ - KokkosKernels::BlockCRS, \ + KokkosSparse::BlockCRS, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ @@ -502,7 +502,7 @@ struct GAUSS_SEIDEL_APPLY, \ - KokkosKernels::BSR, \ + KokkosSparse::BSR, \ Kokkos::View, \ Kokkos::MemoryTraits >, \ diff --git a/src/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp new file mode 100644 index 0000000000..b3008ff716 --- /dev/null +++ b/src/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp @@ -0,0 +1,306 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOS_SPADD_NUMERIC_IMPL_HPP +#define _KOKKOS_SPADD_NUMERIC_IMPL_HPP + +#include "KokkosKernels_Handle.hpp" +#include "KokkosKernels_Sorting.hpp" +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosSparse { +namespace Impl { + +template +struct SortedNumericSumFunctor { + using CscalarT = typename CvaluesT::non_const_value_type; + + SortedNumericSumFunctor(const ArowptrsT& Arowptrs_, + const BrowptrsT& Browptrs_, + const CrowptrsT& Crowptrs_, + const AcolindsT& Acolinds_, + const BcolindsT& Bcolinds_, + const CcolindsT& Ccolinds_, const AvaluesT& Avalues_, + const BvaluesT& Bvalues_, const CvaluesT& Cvalues_, + const AscalarT alpha_, const BscalarT beta_) + : Arowptrs(Arowptrs_), + Browptrs(Browptrs_), + Crowptrs(Crowptrs_), + Acolinds(Acolinds_), + Bcolinds(Bcolinds_), + Ccolinds(Ccolinds_), + Avalues(Avalues_), + Bvalues(Bvalues_), + Cvalues(Cvalues_), + alpha(alpha_), + beta(beta_) {} + + KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + + // count the union of 
nonzeros in Arow and Brow + size_type ai = 0; + size_type bi = 0; + size_type Arowstart = Arowptrs(i); + size_type Arowlen = Arowptrs(i + 1) - Arowstart; + size_type Browstart = Browptrs(i); + size_type Browlen = Browptrs(i + 1) - Browstart; + ordinal_type Acol = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart); + ordinal_type Bcol = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart); + size_type Coffset = Crowptrs(i); + while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) { + ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol; + // Eat all entries in both A and B which have this column + // This also results in Acol/Bcol being updated to following entries for + // next loop iter + CscalarT accum = Kokkos::ArithTraits::zero(); + while (Acol == Ccol) { + accum += static_cast(alpha * Avalues(Arowstart + ai)); + ai++; + if (ai == Arowlen) + Acol = ORDINAL_MAX; + else + Acol = Acolinds(Arowstart + ai); + } + while (Bcol == Ccol) { + accum += static_cast(beta * Bvalues(Browstart + bi)); + bi++; + if (bi == Browlen) + Bcol = ORDINAL_MAX; + else + Bcol = Bcolinds(Browstart + bi); + } + Ccolinds(Coffset) = Ccol; + Cvalues(Coffset) = accum; + Coffset++; + } + } + + const ArowptrsT Arowptrs; + const BrowptrsT Browptrs; + const CrowptrsT Crowptrs; + const AcolindsT Acolinds; + const BcolindsT Bcolinds; + CcolindsT Ccolinds; + const AvaluesT Avalues; + const BvaluesT Bvalues; + CvaluesT Cvalues; + const AscalarT alpha; + const BscalarT beta; +}; + +template +struct UnsortedNumericSumFunctor { + using CscalarT = typename CvaluesT::non_const_value_type; + + UnsortedNumericSumFunctor( + const ArowptrsT Arowptrs_, const BrowptrsT Browptrs_, + const CrowptrsT Crowptrs_, const AcolindsT Acolinds_, + const BcolindsT Bcolinds_, CcolindsT Ccolinds_, const AvaluesT Avalues_, + const BvaluesT Bvalues_, CvaluesT Cvalues_, const AscalarT alpha_, + const BscalarT beta_, const CcolindsT Apos_, const CcolindsT Bpos_) + : Arowptrs(Arowptrs_), + Browptrs(Browptrs_), + Crowptrs(Crowptrs_), + 
Acolinds(Acolinds_), + Bcolinds(Bcolinds_), + Ccolinds(Ccolinds_), + Avalues(Avalues_), + Bvalues(Bvalues_), + Cvalues(Cvalues_), + alpha(alpha_), + beta(beta_), + Apos(Apos_), + Bpos(Bpos_) {} + + KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + size_type CrowStart = Crowptrs(i); + size_type CrowEnd = Crowptrs(i + 1); + size_type ArowStart = Arowptrs(i); + size_type ArowEnd = Arowptrs(i + 1); + size_type BrowStart = Browptrs(i); + size_type BrowEnd = Browptrs(i + 1); + for (size_type j = CrowStart; j < CrowEnd; j++) + Cvalues(j) = Kokkos::ArithTraits::zero(); + // add in A entries, while setting C colinds + for (size_type j = ArowStart; j < ArowEnd; j++) { + Cvalues(CrowStart + Apos(j)) += alpha * Avalues(j); + Ccolinds(CrowStart + Apos(j)) = Acolinds(j); + } + // add in B entries, while setting C colinds + for (size_type j = BrowStart; j < BrowEnd; j++) { + Cvalues(CrowStart + Bpos(j)) += beta * Bvalues(j); + Ccolinds(CrowStart + Bpos(j)) = Bcolinds(j); + } + } + const ArowptrsT Arowptrs; + const BrowptrsT Browptrs; + const CrowptrsT Crowptrs; + const AcolindsT Acolinds; + const BcolindsT Bcolinds; + CcolindsT Ccolinds; + const AvaluesT Avalues; + const BvaluesT Bvalues; + CvaluesT Cvalues; + const AscalarT alpha; + const BscalarT beta; + const CcolindsT Apos; + const CcolindsT Bpos; +}; + +// Helper macro to check that two types are the same (ignoring const) +#define SAME_TYPE(A, B) \ + std::is_same::type, \ + typename std::remove_const::type>::value + +template +void spadd_numeric_impl( + KernelHandle* kernel_handle, const alno_row_view_t a_rowmap, + const alno_nnz_view_t a_entries, const ascalar_nnz_view_t a_values, + const ascalar_t alpha, const blno_row_view_t b_rowmap, + const blno_nnz_view_t b_entries, const bscalar_nnz_view_t b_values, + const bscalar_t beta, const clno_row_view_t c_rowmap, + clno_nnz_view_t c_entries, cscalar_nnz_view_t c_values) { + typedef typename KernelHandle::size_type size_type; + typedef typename 
KernelHandle::nnz_lno_t ordinal_type; + typedef typename KernelHandle::nnz_scalar_t scalar_type; + typedef + typename KernelHandle::SPADDHandleType::execution_space execution_space; + // Check that A/B/C data types match KernelHandle types, and that C data types + // are nonconst (doesn't matter if A/B types are const) + static_assert(SAME_TYPE(ascalar_t, scalar_type), + "A scalar type must match handle scalar type"); + static_assert(SAME_TYPE(bscalar_t, scalar_type), + "B scalar type must match handle scalar type"); + static_assert(SAME_TYPE(typename alno_row_view_t::value_type, size_type), + "add_symbolic: A size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert(SAME_TYPE(typename blno_row_view_t::value_type, size_type), + "add_symbolic: B size_type must match KernelHandle size_type " + "(const doesn't matter)"); + static_assert( + SAME_TYPE(typename clno_row_view_t::non_const_value_type, size_type), + "add_symbolic: C size_type must match KernelHandle size_type)"); + static_assert(SAME_TYPE(typename alno_nnz_view_t::value_type, ordinal_type), + "add_symbolic: A entry type must match KernelHandle entry type " + "(aka nnz_lno_t, and const doesn't matter)"); + static_assert(SAME_TYPE(typename blno_nnz_view_t::value_type, ordinal_type), + "add_symbolic: B entry type must match KernelHandle entry type " + "(aka nnz_lno_t, and const doesn't matter)"); + static_assert(SAME_TYPE(typename clno_nnz_view_t::value_type, ordinal_type), + "add_symbolic: C entry type must match KernelHandle entry type " + "(aka nnz_lno_t)"); + static_assert(std::is_same::value, + "add_symbolic: C entry type must not be const"); + static_assert( + SAME_TYPE(typename ascalar_nnz_view_t::value_type, scalar_type), + "add_symbolic: A scalar type must match KernelHandle entry type (aka " + "nnz_lno_t, and const doesn't matter)"); + static_assert( + SAME_TYPE(typename bscalar_nnz_view_t::value_type, scalar_type), + "add_symbolic: B scalar type must match 
KernelHandle entry type (aka " + "nnz_lno_t, and const doesn't matter)"); + static_assert( + SAME_TYPE(typename cscalar_nnz_view_t::value_type, scalar_type), + "add_symbolic: C scalar type must match KernelHandle entry type (aka " + "nnz_lno_t)"); + static_assert(std::is_same::value, + "add_symbolic: C scalar type must not be const"); + typedef Kokkos::RangePolicy range_type; + auto addHandle = kernel_handle->get_spadd_handle(); + // rowmap length can be 0 or 1 if #rows is 0. + // Otherwise, it's always #rows+1. + if (a_rowmap.extent(0) == 0 || a_rowmap.extent(0) == 1) { + addHandle->set_call_numeric(); + return; + } + ordinal_type nrows = a_rowmap.extent(0) - 1; + if (addHandle->is_input_sorted()) { + SortedNumericSumFunctor + sortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries, + c_entries, a_values, b_values, c_values, alpha, beta); + Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputSorted", + range_type(0, nrows), sortedNumeric); + } else { + // use a_pos and b_pos (set in the handle by symbolic) to quickly compute C + // entries and values + UnsortedNumericSumFunctor + unsortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries, + c_entries, a_values, b_values, c_values, alpha, beta, + addHandle->get_a_pos(), addHandle->get_b_pos()); + Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputNotSorted", + range_type(0, nrows), unsortedNumeric); + } + addHandle->set_call_numeric(); +} + +#undef SAME_TYPE + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp new file mode 100644 index 0000000000..7cc93e2715 --- /dev/null +++ b/src/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp @@ -0,0 +1,244 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSSPARSE_IMPL_SPADD_NUMERIC_SPEC_HPP_ +#define KOKKOSSPARSE_IMPL_SPADD_NUMERIC_SPEC_HPP_ + +#include + +#include +#include "KokkosKernels_Handle.hpp" +// Include the actual functors +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include "KokkosSparse_spadd_numeric_impl.hpp" +#endif + +namespace KokkosSparse { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct spadd_numeric_eti_spec_avail { + enum : bool { value = false }; +}; + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spadd_numeric_eti_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosSparse { +namespace Impl { + +// Unification layer +/// \brief Implementation of KokkosBlas::spadd (sparse-sparse matrix addition) + +template ::value, + bool eti_spec_avail = spadd_numeric_eti_spec_avail< + KernelHandle, a_size_view_t, a_lno_view_t, a_scalar_view_t, + b_size_view_t, b_lno_view_t, b_scalar_view_t, c_size_view_t, + c_lno_view_t, 
c_scalar_view_t>::value> +struct SPADD_NUMERIC { + static void spadd_numeric(KernelHandle *handle, + typename a_scalar_view_t::const_value_type alpha, + a_size_view_t row_mapA, a_lno_view_t entriesA, + a_scalar_view_t valuesA, + typename b_scalar_view_t::const_value_type beta, + b_size_view_t row_mapB, b_lno_view_t entriesB, + b_scalar_view_t valuesB, c_size_view_t row_mapC, + c_lno_view_t entriesC, c_scalar_view_t valuesC); +}; + +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + +template +struct SPADD_NUMERIC { + static void spadd_numeric(KernelHandle *handle, + typename a_scalar_view_t::const_value_type alpha, + a_size_view_t row_mapA, a_lno_view_t entriesA, + a_scalar_view_t valuesA, + typename b_scalar_view_t::const_value_type beta, + b_size_view_t row_mapB, b_lno_view_t entriesB, + b_scalar_view_t valuesB, c_size_view_t row_mapC, + c_lno_view_t entriesC, c_scalar_view_t valuesC) { + spadd_numeric_impl(handle, row_mapA, entriesA, valuesA, alpha, row_mapB, + entriesB, valuesB, beta, row_mapC, entriesC, valuesC); + } +}; + +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPADD_NUMERIC< \ + typename KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSSPARSE_SPADD_NUMERIC_ETI_SPEC_INST( \ + SCALAR_TYPE, 
ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPADD_NUMERIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#include +#include + +#endif diff --git a/src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp new file mode 100644 index 0000000000..c4ae435f55 --- /dev/null +++ b/src/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -0,0 +1,635 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Brian Kelley (bmkelle@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOS_SPADD_SYMBOLIC_IMPL_HPP +#define _KOKKOS_SPADD_SYMBOLIC_IMPL_HPP + +#include "KokkosKernels_Handle.hpp" +#include "KokkosSparse_SortCrs.hpp" +#include "Kokkos_ArithTraits.hpp" + +namespace KokkosSparse { +namespace Impl { + +// Helper macro to check that two types are the same (ignoring const) +#define SAME_TYPE(A, B) \ + std::is_same::type, \ + typename std::remove_const::type>::value + +// get C rowmap for sorted input +template +struct SortedCountEntriesRange { + SortedCountEntriesRange(ordinal_type nrows_, + const typename ARowPtrsT::const_type& Arowptrs_, + const AColIndsT& Acolinds_, + const typename BRowPtrsT::const_type& Browptrs_, + const BColIndsT& Bcolinds_, + const CRowPtrsT& Crowcounts_) + : nrows(nrows_), + Arowptrs(Arowptrs_), + Acolinds(Acolinds_), + Browptrs(Browptrs_), + Bcolinds(Bcolinds_), + Crowcounts(Crowcounts_) {} + + 
KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + + // count the union of nonzeros in Arow and Brow + size_type numEntries = 0; + size_type ai = 0; + size_type bi = 0; + size_type Arowstart = Arowptrs(i); + size_type Arowlen = Arowptrs(i + 1) - Arowstart; + size_type Browstart = Browptrs(i); + size_type Browlen = Browptrs(i + 1) - Browstart; + ordinal_type Acol = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart); + ordinal_type Bcol = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart); + while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) { + ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol; + numEntries++; + // Eat all entries in both A and B which have this column + // This also results in Acol/Bcol being updated to following entries for + // next loop iter + while (Acol == Ccol) + Acol = (ai == Arowlen) ? ORDINAL_MAX : Acolinds(Arowstart + ai++); + while (Bcol == Ccol) + Bcol = (bi == Browlen) ? ORDINAL_MAX : Bcolinds(Browstart + bi++); + } + Crowcounts(i) = numEntries; + } + + ordinal_type nrows; + const typename ARowPtrsT::const_type Arowptrs; + const AColIndsT Acolinds; + const typename BRowPtrsT::const_type Browptrs; + const BColIndsT Bcolinds; + CRowPtrsT Crowcounts; +}; + +template +struct SortedCountEntriesTeam { + SortedCountEntriesTeam(ordinal_type nrows_, + const typename ARowPtrsT::const_type& Arowptrs_, + const AColIndsT& Acolinds_, + const typename BRowPtrsT::const_type& Browptrs_, + const BColIndsT& Bcolinds_, + const CRowPtrsT& Crowcounts_) + : nrows(nrows_), + Arowptrs(Arowptrs_), + Acolinds(Acolinds_), + Browptrs(Browptrs_), + Bcolinds(Bcolinds_), + Crowcounts(Crowcounts_) {} + + using TeamPol = Kokkos::TeamPolicy; + using TeamMem = typename TeamPol::member_type; + + KOKKOS_INLINE_FUNCTION void longRowFallback(const ordinal_type i) const { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + + // count the union of nonzeros in Arow and Brow + size_type 
numEntries = 0; + size_type ai = 0; + size_type bi = 0; + size_type Arowstart = Arowptrs(i); + size_type Arowlen = Arowptrs(i + 1) - Arowstart; + size_type Browstart = Browptrs(i); + size_type Browlen = Browptrs(i + 1) - Browstart; + ordinal_type Acol = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart); + ordinal_type Bcol = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart); + while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) { + ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol; + numEntries++; + // Eat all entries in both A and B which have this column + // This also results in Acol/Bcol being updated to following entries for + // next loop iter + while (Acol == Ccol) + Acol = (ai == Arowlen) ? ORDINAL_MAX : Acolinds(Arowstart + ai++); + while (Bcol == Ccol) + Bcol = (bi == Browlen) ? ORDINAL_MAX : Bcolinds(Browstart + bi++); + } + Crowcounts(i) = numEntries; + } + + KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const { + ordinal_type i = t.league_rank() * t.team_size() + t.team_rank(); + if (i >= nrows) return; + ordinal_type* allScratch = + (ordinal_type*)t.team_shmem().get_shmem(totalShared); + ordinal_type* scratch = allScratch + t.team_rank() * sharedPerThread; + ordinal_type Arowstart = Arowptrs(i); + ordinal_type Arowlen = Arowptrs(i + 1) - Arowstart; + ordinal_type Browstart = Browptrs(i); + ordinal_type Browlen = Browptrs(i + 1) - Browstart; + ordinal_type n = Arowlen + Browlen; + if (n > sharedPerThread) { + // fall back to slow serial method + Kokkos::single(Kokkos::PerThread(t), [&]() { longRowFallback(i); }); + return; + } + if (n == 0) { + Kokkos::single(Kokkos::PerThread(t), [&]() { Crowcounts(i) = 0; }); + return; + } + // Figure out the number of bitonic steps: ceil(log2(n)) + ordinal_type npot = 1; + ordinal_type levels = 0; + while (npot < n) { + levels++; + npot <<= 1; + } + // Copy A and B entries to scratch + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(t, Arowlen), + [&](ordinal_type j) { scratch[j] = Acolinds(Arowstart + 
j); }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, Browlen), + [&](ordinal_type j) { + scratch[npot - 1 - j] = Bcolinds(Browstart + j); + }); + // Fill space between A and B with ORDINAL_MAX, + // to maintain a valid bitonic sequence of power-of-two length + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(t, npot - n), [&](ordinal_type j) { + scratch[Arowlen + j] = Kokkos::ArithTraits::max(); + }); + // npot = 2^levels + for (ordinal_type level = 0; level < levels; level++) { + // npot/2 pairs of items are compared in parallel + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, npot >> 1), + [&](const ordinal_type j) { + ordinal_type boxSize = npot >> level; + // Which box contains this thread? + // box = (j / boxSize), and boxSize = + // 2^(levels-level), so box = j * 2^(level-levels) + // = j >> (levels - level) + ordinal_type boxID = (j * 2) >> (levels - level); + // boxStart = boxID * boxSize = boxID * + // 2^(levels-level) = boxID << (levels-level) + ordinal_type boxStart = boxID << (levels - level); + ordinal_type boxOffset = j - boxID * boxSize / 2; + ordinal_type elem1 = boxStart + boxOffset; + ordinal_type elem2 = elem1 + (boxSize >> 1); + if (scratch[elem2] < scratch[elem1]) { + ordinal_type temp = scratch[elem1]; + scratch[elem1] = scratch[elem2]; + scratch[elem2] = temp; + } + }); + } + // Finally, count the number of distinct entries (this is #rising edges + 1) + ordinal_type risingEdges; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(t, n - 1), + [&](const ordinal_type j, ordinal_type& lcount) { + if (scratch[j] != scratch[j + 1]) lcount++; + }, + risingEdges); + Kokkos::single(Kokkos::PerThread(t), + [&]() { Crowcounts(i) = risingEdges + 1; }); + } + + size_t team_shmem_size(int teamSize) const { + return sharedPerThread * sizeof(ordinal_type) * teamSize; + } + + ordinal_type nrows; + const typename ARowPtrsT::const_type Arowptrs; + const AColIndsT Acolinds; + const typename BRowPtrsT::const_type Browptrs; + const BColIndsT 
Bcolinds; + CRowPtrsT Crowcounts; + int sharedPerThread; // Shared for each thread, measured in + // sizeof(ordinal_type) + int totalShared; // Shared for whole team, measured in bytes +}; + +// get upper bound for C entries per row (assumes worst case, that entries in A +// and B on each row are disjoint) +template +struct UnsortedEntriesUpperBound { + UnsortedEntriesUpperBound(ordinal_type nrows_, + const typename ARowPtrsT::const_type& Arowptrs_, + const typename BRowPtrsT::const_type& Browptrs_, + const CRowPtrsT& Crowcounts_) + : nrows(nrows_), + Arowptrs(Arowptrs_), + Browptrs(Browptrs_), + Crowcounts(Crowcounts_) {} + KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + Crowcounts(i) = + (Arowptrs(i + 1) - Arowptrs(i)) + (Browptrs(i + 1) - Browptrs(i)); + if (i == nrows - 1) { + // last workitem also zeros the one-past-end entry of row counts, so + // that prefix sum is correct + Crowcounts(nrows) = 0; + } + } + ordinal_type nrows; + const typename ARowPtrsT::const_type Arowptrs; + const typename BRowPtrsT::const_type Browptrs; + CRowPtrsT Crowcounts; +}; + +// Unsorted symbolic: new functors: +// -compute uncompressed C (entries only, no values) +// -sort uncompressed C entries within row, while permuting A union B +// permutation array -compress sorted C entries and A,B perm arrays at the same +// time, which produces Crowcounts value +// Inputs: A, B rowptrs/colinds, C uncompressed rowptrs (and allocated C +// entries) Output: C uncompressed colinds +template +struct UnmergedSumFunctor { + UnmergedSumFunctor(ordinal_type nrows_, const ArowptrsT& Arowptrs_, + const AcolindsT& Acolinds_, const BrowptrsT& Browptrs_, + const BcolindsT& Bcolinds_, const CrowptrsT& Crowptrs_, + const CcolindsT& Ccolinds_, const CcolindsT& ABperm_) + : nrows(nrows_), + Arowptrs(Arowptrs_), + Acolinds(Acolinds_), + Browptrs(Browptrs_), + Bcolinds(Bcolinds_), + Crowptrs(Crowptrs_), + Ccolinds(Ccolinds_), + ABperm(ABperm_) {} + KOKKOS_INLINE_FUNCTION void 
operator()(const ordinal_type i) const { + size_type inserted = 0; + size_type crowstart = Crowptrs(i); + size_type arowstart = Arowptrs(i); + size_type arowlen = Arowptrs(i + 1) - arowstart; + size_type browstart = Browptrs(i); + size_type browlen = Browptrs(i + 1) - browstart; + // Insert all A entries, then all B entries + for (size_type j = 0; j < arowlen; j++) { + Ccolinds(crowstart + inserted) = Acolinds(arowstart + j); + ABperm(crowstart + inserted) = j; + inserted++; + } + for (size_type j = 0; j < browlen; j++) { + Ccolinds(crowstart + inserted) = Bcolinds(browstart + j); + // tell A and B permutation values apart by adding arowlen as a bias to B + // values + ABperm(crowstart + inserted) = j + arowlen; + inserted++; + } + } + ordinal_type nrows; + const ArowptrsT Arowptrs; + const AcolindsT Acolinds; + const BrowptrsT Browptrs; + const BcolindsT Bcolinds; + const CrowptrsT Crowptrs; + CcolindsT Ccolinds; + CcolindsT ABperm; +}; + +template +struct MergeEntriesFunctor { + MergeEntriesFunctor(ordinal_type nrows_, const ArowptrsT& Arowptrs_, + const BrowptrsT& Browptrs_, const OffsetView& Crowptrs_, + const CrowptrsT& Crowcounts_, const CcolindsT& Ccolinds_, + const CcolindsT& ABperm_, const CcolindsT& Apos_, + const CcolindsT& Bpos_) + : nrows(nrows_), + Arowptrs(Arowptrs_), + Browptrs(Browptrs_), + Crowptrs(Crowptrs_), + Crowcounts(Crowcounts_), + Ccolinds(Ccolinds_), + ABperm(ABperm_), + Apos(Apos_), + Bpos(Bpos_) {} + KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { + size_type CrowStart = Crowptrs(i); + size_type CrowEnd = Crowptrs(i + 1); + if (CrowEnd == CrowStart) { + Crowcounts(i) = 0; + return; + } + size_type ArowStart = Arowptrs(i); + size_type ArowNum = Arowptrs(i + 1) - ArowStart; + size_type BrowStart = Browptrs(i); + ordinal_type CFit = 0; // counting through merged C indices (within row) + for (size_type Cit = CrowStart; Cit < CrowEnd; Cit++) { + if ((Cit > CrowStart) && (Ccolinds(Cit) != Ccolinds(Cit - 1))) { + // This 
is a different column than the previous entry, and is not the + // first entry. This means that this is the first occurence of a unique + // column. + CFit++; + } + size_type permVal = ABperm(Cit); + if (permVal < ArowNum) { + // Entry belongs to A + ordinal_type Aindex = permVal; + // The Aindex'th entry in row i of A will be added into the CFit'th + // entry in C + Apos(ArowStart + Aindex) = CFit; + } else { + // Entry belongs to B + ordinal_type Bindex = permVal - ArowNum; + // The Bindex'th entry in row i of B will be added into the CFit'th + // entry in C + Bpos(BrowStart + Bindex) = CFit; + } + } + // At end of the row, know how many entries are in merged C. + // Right now, CFit is the index of the last Apos/Bpos, + // so adding one gives the total number of entries. + Crowcounts(i) = CFit + 1; + } + ordinal_type nrows; + const ArowptrsT Arowptrs; + const BrowptrsT Browptrs; + const OffsetView Crowptrs; + CrowptrsT Crowcounts; + CcolindsT Ccolinds; + const CcolindsT ABperm; + CcolindsT Apos; + CcolindsT Bpos; +}; + +// Run SortedCountEntries: non-GPU, always uses the RangePolicy version. 
+template +void runSortedCountEntries( + const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, + const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, + const clno_row_view_t_& c_rowmap, + typename std::enable_if()>::type* = + nullptr) { + using size_type = typename KernelHandle::size_type; + using ordinal_type = typename KernelHandle::nnz_lno_t; + using execution_space = + typename KernelHandle::SPADDHandleType::execution_space; + using range_type = Kokkos::RangePolicy; + auto nrows = c_rowmap.extent(0) - 1; + SortedCountEntriesRange + countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + Kokkos::parallel_for( + "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", + range_type(0, nrows), countEntries); +} + +// Run SortedCountEntries: GPU, uses the TeamPolicy or RangePolicy depending +// on average nz per row (a runtime decision) +template +void runSortedCountEntries( + const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, + const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, + const clno_row_view_t_& c_rowmap, + typename std::enable_if()>::type* = + nullptr) { + using size_type = typename KernelHandle::size_type; + using ordinal_type = typename KernelHandle::nnz_lno_t; + using execution_space = + typename KernelHandle::SPADDHandleType::execution_space; + using RangePol = Kokkos::RangePolicy; + using TeamPol = Kokkos::TeamPolicy; + auto nrows = c_rowmap.extent(0) - 1; + size_type c_est_nnz = + 1.4 * (a_entries.extent(0) + b_entries.extent(0)) / nrows; + if (c_est_nnz <= 512) { + // Convert c_est_nnz to a power of 2 + size_type pot_est_nnz = 1; + while (pot_est_nnz < c_est_nnz) pot_est_nnz *= 2; + // Estimate max number of uncompressed entries in each row of C + int vector_length = 1; + int vector_length_max = + KokkosKernels::Impl::kk_get_max_vector_size(); + while (vector_length * 2 <= vector_length_max && + (size_type)vector_length * 2 <= pot_est_nnz) { + vector_length *= 
2; + } + SortedCountEntriesTeam + countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + countEntries.sharedPerThread = pot_est_nnz; + // compute largest possible team size + TeamPol testPolicy(1, 1, vector_length); + testPolicy.set_scratch_size( + 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); + int team_size = testPolicy.team_size_recommended(countEntries, + Kokkos::ParallelForTag()); + // construct real policy + int league_size = (nrows + team_size - 1) / team_size; + TeamPol policy(league_size, team_size, vector_length); + policy.set_scratch_size( + 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); + countEntries.totalShared = + countEntries.sharedPerThread * team_size * sizeof(ordinal_type); + Kokkos::parallel_for( + "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", policy, + countEntries); + } else { + SortedCountEntriesRange + countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + Kokkos::parallel_for( + "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", + RangePol(0, nrows), countEntries); + } +} + +// Symbolic: count entries in each row in C to produce rowmap +// kernel handle has information about whether it is sorted add or not. 
+template +void spadd_symbolic_impl( + KernelHandle* handle, const alno_row_view_t_ a_rowmap, + const alno_nnz_view_t_ a_entries, const blno_row_view_t_ b_rowmap, + const blno_nnz_view_t_ b_entries, + clno_row_view_t_ c_rowmap) // c_rowmap must already be allocated (doesn't + // need to be initialized) +{ + typedef + typename KernelHandle::SPADDHandleType::execution_space execution_space; + typedef typename KernelHandle::size_type size_type; + typedef typename KernelHandle::nnz_lno_t ordinal_type; + typedef typename KernelHandle::SPADDHandleType::nnz_lno_view_t ordinal_view_t; + typedef typename KernelHandle::SPADDHandleType::nnz_row_view_t offset_view_t; + // Check that A/B/C data types match KernelHandle types, and that C data types + // are nonconst (doesn't matter if A/B types are const) + static_assert( + SAME_TYPE(typename alno_row_view_t_::non_const_value_type, size_type), + "add_symbolic: A size_type must match KernelHandle size_type (const " + "doesn't matter)"); + static_assert( + SAME_TYPE(typename blno_row_view_t_::non_const_value_type, size_type), + "add_symbolic: B size_type must match KernelHandle size_type (const " + "doesn't matter)"); + static_assert( + SAME_TYPE(typename clno_row_view_t_::non_const_value_type, size_type), + "add_symbolic: C size_type must match KernelHandle size_type)"); + static_assert(std::is_same::value, + "add_symbolic: C size_type must not be const"); + static_assert( + SAME_TYPE(typename alno_nnz_view_t_::non_const_value_type, ordinal_type), + "add_symbolic: A entry type must match KernelHandle entry type (aka " + "nnz_lno_t, and const doesn't matter)"); + static_assert( + SAME_TYPE(typename blno_nnz_view_t_::non_const_value_type, ordinal_type), + "add_symbolic: B entry type must match KernelHandle entry type (aka " + "nnz_lno_t, and const doesn't matter)"); + static_assert(std::is_same::value, + "add_symbolic: C entry type must not be const"); + // symbolic just needs to compute c_rowmap + // easy for sorted, but for 
unsorted is easiest to just compute the whole sum + auto addHandle = handle->get_spadd_handle(); + if (a_rowmap.extent(0) == 0 || a_rowmap.extent(0) == 1) { + // Have 0 rows, so nothing to do except set #nnz to 0 + addHandle->set_c_nnz(0); + // If c_rowmap has a single entry, it must be 0 + if (c_rowmap.extent(0)) Kokkos::deep_copy(c_rowmap, (size_type)0); + addHandle->set_call_symbolic(); + return; + } + ordinal_type nrows = a_rowmap.extent(0) - 1; + typedef Kokkos::RangePolicy range_type; + if (addHandle->is_input_sorted()) { + runSortedCountEntries( + a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + nrows + 1, c_rowmap); + } else { + // note: scoping individual parts of the process to free views sooner, + // minimizing peak memory usage run the unsorted c_rowmap upper bound + // functor (just adds together A and B entry counts row by row) + offset_view_t c_rowmap_upperbound( + Kokkos::view_alloc(Kokkos::WithoutInitializing, + "C row counts upper bound"), + nrows + 1); + size_type c_nnz_upperbound = 0; + { + UnsortedEntriesUpperBound + countEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound); + Kokkos::parallel_for( + "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries", + range_type(0, nrows), countEntries); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + nrows + 1, c_rowmap_upperbound); + Kokkos::deep_copy(c_nnz_upperbound, + Kokkos::subview(c_rowmap_upperbound, nrows)); + } + ordinal_view_t c_entries_uncompressed( + Kokkos::view_alloc(Kokkos::WithoutInitializing, + "C entries uncompressed"), + c_nnz_upperbound); + ordinal_view_t ab_perm(Kokkos::view_alloc(Kokkos::WithoutInitializing, + "A and B permuted entry indices"), + c_nnz_upperbound); + // compute the unmerged sum + UnmergedSumFunctor + unmergedSum(nrows, a_rowmap, a_entries, b_rowmap, b_entries, + c_rowmap_upperbound, c_entries_uncompressed, ab_perm); + Kokkos::parallel_for( + 
"KokkosSparse::SpAdd:Symbolic::InputNotSorted::UnmergedSum", + range_type(0, nrows), unmergedSum); + // sort the unmerged sum + KokkosSparse::sort_crs_matrix( + c_rowmap_upperbound, c_entries_uncompressed, ab_perm); + ordinal_view_t a_pos( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "A entry positions"), + a_entries.extent(0)); + ordinal_view_t b_pos( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "B entry positions"), + b_entries.extent(0)); + // merge the entries and compute Apos/Bpos, as well as Crowcounts + { + MergeEntriesFunctor + mergeEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound, c_rowmap, + c_entries_uncompressed, ab_perm, a_pos, b_pos); + Kokkos::parallel_for( + "KokkosSparse::SpAdd:Symbolic::InputNotSorted::MergeEntries", + range_type(0, nrows), mergeEntries); + // compute actual c_rowmap + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( + nrows + 1, c_rowmap); + } + addHandle->set_a_b_pos(a_pos, b_pos); + } + // provide the number of NNZ in C to user through handle + size_type cmax; + Kokkos::deep_copy(cmax, Kokkos::subview(c_rowmap, nrows)); + addHandle->set_c_nnz(cmax); + addHandle->set_call_symbolic(); + addHandle->set_call_numeric(false); +} + +#undef SAME_TYPE + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp b/src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp new file mode 100644 index 0000000000..7a48999e6a --- /dev/null +++ b/src/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp @@ -0,0 +1,189 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSSPARSE_IMPL_SPADD_SYMBOLIC_SPEC_HPP_ +#define KOKKOSSPARSE_IMPL_SPADD_SYMBOLIC_SPEC_HPP_ + +#include + +#include +#include "KokkosKernels_Handle.hpp" +// Include the actual functors +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include "KokkosSparse_spadd_symbolic_impl.hpp" +#endif + +namespace KokkosSparse { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct spadd_symbolic_eti_spec_avail { + enum : bool { value = false }; +}; + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spadd_symbolic_eti_spec_avail< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +// Include the actual specialization declarations +#include +#include + +namespace KokkosSparse { +namespace Impl { + +// Unification layer +/// \brief Implementation of KokkosBlas::spadd (sparse-sparse matrix addition) + +template ::value, + bool eti_spec_avail = spadd_symbolic_eti_spec_avail< + KernelHandle, a_size_view_t, a_lno_view_t, b_size_view_t, + b_lno_view_t, c_size_view_t>::value> +struct SPADD_SYMBOLIC { + static void spadd_symbolic(KernelHandle *handle, a_size_view_t row_mapA, + a_lno_view_t entriesA, b_size_view_t row_mapB, + b_lno_view_t entriesB, c_size_view_t row_mapC); +}; + +#if 
!defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + +template +struct SPADD_SYMBOLIC { + static void spadd_symbolic(KernelHandle *handle, a_size_view_t row_mapA, + a_lno_view_t entriesA, b_size_view_t row_mapB, + b_lno_view_t entriesB, c_size_view_t row_mapC) { + spadd_symbolic_impl(handle, row_mapA, entriesA, row_mapB, entriesB, + row_mapC); + } +}; + +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPADD_SYMBOLIC< \ + typename KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPADD_SYMBOLIC< \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + false, true>; + +#include +#include + +#endif diff --git a/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp b/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp index e566e8bf06..c6a24e2163 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_imp_outer.hpp @@ -509,7 +509,7 @@ struct KokkosSPGEMM #include 
-#include +#include #include #include #include @@ -282,7 +274,7 @@ class KokkosSPGEMM { typedef Kokkos::TeamPolicy > dynamic_team_policy_t; - private: + protected: HandleType *handle; nnz_lno_t a_row_cnt; nnz_lno_t b_row_cnt; @@ -795,7 +787,7 @@ class KokkosSPGEMM { typename c_scalar_nnz_view_t::const_value_type omega, dinv_view_t dinv, KokkosKernels::Impl::ExecSpaceType lcl_my_exec_space); - private: + protected: template void KokkosSPGEMM_jacobi_denseacc( diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_seq.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_seq.hpp index ce3501c447..32492482fe 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_seq.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_seq.hpp @@ -95,17 +95,17 @@ void spgemm_debug_symbolic(KernelHandle *handle, lno_t row_size = 0; for (lno_t j = 0; j < a_row_size; ++j) { - size_type ind = a_row_begin + j; - lno_t col = h_enta(ind); - // scalar_t val = h_vala(ind); + size_type a_ind = a_row_begin + j; + lno_t col = h_enta(a_ind); + // scalar_t val = h_vala(a_ind); const size_type b_row_begin = h_rmb(col); const size_type b_row_end = h_rmb(col + 1); lno_t b_row_size = b_row_end - b_row_begin; for (lno_t z = 0; z < b_row_size; ++z) { - size_type ind_ = b_row_begin + z; - lno_t b_col = h_entb(ind_); - // scalar_t b_val = h_valb(ind_); + size_type b_ind = b_row_begin + z; + lno_t b_col = h_entb(b_ind); + // scalar_t b_val = h_valb(b_ind); // if (i == 0) std::cout << "\tb col:" << b_col << std::endl; if (acc_flag[b_col] == false) { acc_flag[b_col] = true; @@ -194,16 +194,16 @@ void spgemm_debug_numeric(KernelHandle * /* handle */, lno_t c_row_size_counter = 0; for (lno_t j = 0; j < a_row_size; ++j) { - size_type ind = a_row_begin + j; - lno_t col = h_enta(ind); - scalar_t val = h_vala(ind); + size_type a_ind = a_row_begin + j; + lno_t col = h_enta(a_ind); + scalar_t val = h_vala(a_ind); const size_type b_row_begin = h_rmb(col); const size_type b_row_end = h_rmb(col + 1); lno_t b_row_size = b_row_end - 
b_row_begin; for (lno_t z = 0; z < b_row_size; ++z) { - size_type ind_ = b_row_begin + z; - lno_t b_col = h_entb(ind_); - scalar_t b_val = h_valb(ind_); + size_type b_ind = b_row_begin + z; + lno_t b_col = h_entb(b_ind); + scalar_t b_val = h_valb(b_ind); if (acc_flag[b_col] == false) { acc_flag[b_col] = true; @@ -216,9 +216,9 @@ void spgemm_debug_numeric(KernelHandle * /* handle */, // if (i == 0) std::cout << "result_cols" << std::endl; for (lno_t j = 0; j < c_row_size; ++j) { - size_type ind = c_row_begin + j; - lno_t result_col = h_entc(ind); - h_valc(ind) = accumulator[result_col]; + size_type c_ind = c_row_begin + j; + lno_t result_col = h_entc(c_ind); + h_valc(c_ind) = accumulator[result_col]; accumulator[result_col] = 0; acc_flag[result_col] = false; } diff --git a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp index bc185c0cd1..847d765cb4 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_impl_speed.hpp @@ -156,9 +156,9 @@ struct KokkosSPGEMM -#include namespace KokkosSparse { namespace Impl { @@ -302,6 +300,11 @@ void mkl2phase_symbolic( (void)transposeA; (void)transposeB; (void)verbose; + (void)a_xadj; + (void)b_xadj; + (void)c_xadj; + (void)a_adj; + (void)b_adj; #endif } else { @@ -351,9 +354,7 @@ void mkl2phase_apply( typename KernelHandle::HandlePersistentMemorySpace; using int_persistent_work_view_t = typename Kokkos::View; - using MyExecSpace = typename KernelHandle::HandleExecSpace; - using value_type = typename KernelHandle::nnz_scalar_t; - using idx = typename KernelHandle::nnz_lno_t; + using idx = typename KernelHandle::nnz_lno_t; if (std::is_same::value) { int *a_xadj = (int *)row_mapA.data(); @@ -639,6 +640,11 @@ void mkl2phase_apply( (void)transposeA; (void)transposeB; (void)verbose; + (void)a_xadj; + (void)b_xadj; + (void)c_xadj; + (void)a_adj; + (void)b_adj; #endif // __INTEL_MKL__ == 2018 && __INTEL_MKL_UPDATE__ >= 2 } else { 
(void)m; diff --git a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp index 8eb0bd3930..9a6ab70f9e 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_mkl_impl.hpp @@ -45,637 +45,270 @@ #ifndef _KOKKOSSPGEMMMKL_HPP #define _KOKKOSSPGEMMMKL_HPP +#include "KokkosKernels_config.h" +#include "KokkosSparse_Utils_mkl.hpp" + #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include "mkl_spblas.h" -#include "mkl.h" -#endif - -#include "KokkosKernels_Utils.hpp" -#include namespace KokkosSparse { - namespace Impl { -template -void mkl_symbolic(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, - typename KernelHandle::nnz_lno_t n, - typename KernelHandle::nnz_lno_t k, - in_row_index_view_type row_mapA, - in_nonzero_index_view_type entriesA, - - bool transposeA, bin_row_index_view_type row_mapB, - bin_nonzero_index_view_type entriesB, bool transposeB, - cin_row_index_view_type row_mapC, bool verbose = false) { -#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +// multiplies two sparse MKL matrices and returns sparse MKL matrix +template +inline static MKLSparseMatrix mkl_spmm( + sparse_operation_t operation, const MKLSparseMatrix &A, + const MKLSparseMatrix &B) { + sparse_matrix_t C; + KOKKOSKERNELS_MKL_SAFE_CALL(mkl_sparse_spmm(operation, A, B, &C)); + return MKLSparseMatrix(C); +} - typedef typename KernelHandle::nnz_lno_t idx; +template +class MKL_SPGEMM { + public: + typedef typename KernelHandle::nnz_lno_t nnz_lno_t; typedef typename KernelHandle::size_type size_type; - - typedef typename KernelHandle::HandleTempMemorySpace HandleTempMemorySpace; - typedef typename Kokkos::View - int_temp_work_view_t; - typedef typename KernelHandle::nnz_scalar_t value_type; - typedef typename KernelHandle::HandleExecSpace MyExecSpace; - /* - if (!( - (Kokkos::SpaceAccessibility::accessible) && - (Kokkos::SpaceAccessibility::accessible) && - (Kokkos::SpaceAccessibility::accessible) ) - ){ - throw 
std::runtime_error ("MEMORY IS NOT ALLOCATED IN HOST DEVICE for - MKL\n"); return; - } - */ - if (std::is_same::value) { - int *a_xadj = NULL; - int *b_xadj = NULL; - int_temp_work_view_t a_xadj_v, b_xadj_v; - - if (std::is_same::value) { - a_xadj = (int *)row_mapA.data(); - b_xadj = (int *)row_mapB.data(); - } else { - // TODO test this case. - - Kokkos::Timer copy_time; - const int max_integer = 2147483647; - if (entriesB.extent(0) > max_integer || - entriesA.extent(0) > max_integer) { - throw std::runtime_error( - "MKL requires integer values for size type for SPGEMM. Copying to " - "integer will cause overflow.\n"); - return; - } - a_xadj_v = int_temp_work_view_t("tmpa", m + 1); - a_xadj = (int *)a_xadj_v.data(); - b_xadj_v = int_temp_work_view_t("tmpb", n + 1); - b_xadj = (int *)b_xadj_v.data(); - - KokkosKernels::Impl::copy_vector( - m + 1, row_mapA, a_xadj_v); - - KokkosKernels::Impl::copy_vector( - m + 1, row_mapB, b_xadj_v); - - if (verbose) - std::cout << "MKL COPY size type to int TIME:" << copy_time.seconds() - << std::endl; + typedef typename Kokkos::View int_tmp_view_t; + + public: + static void mkl_symbolic(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, + nnz_lno_t k, a_rowmap_view_type row_mapA, + a_index_view_type entriesA, bool transposeA, + b_rowmap_view_type row_mapB, + b_index_view_type entriesB, bool transposeB, + c_rowmap_view_type row_mapC, bool verbose = false) { + if (m < 1 || n < 1 || k < 1 || entriesA.extent(0) < 1 || + entriesB.extent(0) < 1) { + // set correct values in non-empty 0-nnz corner case + handle->set_c_nnz(0); + Kokkos::deep_copy(row_mapC, 0); + return; } - int *a_adj = (int *)entriesA.data(); - int *b_adj = (int *)entriesB.data(); - - std::vector tmp_values( - KOKKOSKERNELS_MACRO_MAX(entriesB.extent(0), entriesA.extent(0))); - value_type *ptmp_values = &(tmp_values[0]); - value_type *a_ew = ptmp_values; - value_type *b_ew = ptmp_values; - - sparse_matrix_t A; - sparse_matrix_t B; - sparse_matrix_t C; - - if 
(std::is_same::value) { - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_s_create_csr(&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj, - a_xadj + 1, a_adj, (float *)a_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n"); - return; - } - - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_s_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj, - b_xadj + 1, b_adj, (float *)b_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n"); - return; - } - - sparse_operation_t operation; - if (transposeA && transposeB) { - operation = SPARSE_OPERATION_TRANSPOSE; - } else if (!(transposeA || transposeB)) { - operation = SPARSE_OPERATION_NON_TRANSPOSE; - } else { - throw std::runtime_error( - "MKL either transpose both matrices, or none for SPGEMM\n"); - return; - } - - Kokkos::Timer timer1; - bool success = - SPARSE_STATUS_SUCCESS != mkl_sparse_spmm(operation, A, B, &C); - if (verbose) - std::cout << "Actual FLOAT MKL SPMM Time in symbolic:" - << timer1.seconds() << std::endl; + Kokkos::Timer timer; + using scalar_t = typename KernelHandle::nnz_scalar_t; - if (success) { - throw std::runtime_error( - "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n"); - - return; - } else { - sparse_index_base_t c_indexing; - MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns; - float *values; - - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_s_export_csr(C, &c_indexing, &c_rows, &c_cols, - &rows_start, &rows_end, &columns, - &values)) { - throw std::runtime_error( - "ERROR at exporting result matrix in mkl_sparse_spmm\n"); - return; - } - - if (SPARSE_INDEX_BASE_ZERO != c_indexing) { - throw std::runtime_error("C is not zero based indexed\n"); - return; - } - - KokkosKernels::Impl::copy_vector< - MKL_INT *, typename cin_row_index_view_type::non_const_type, - MyExecSpace>(m, rows_start, row_mapC); - idx nnz = row_mapC(m) = rows_end[m - 1]; + const auto export_rowmap = [&](MKL_INT num_rows, MKL_INT *rows_start, + MKL_INT * /*columns*/, + 
scalar_t * /*values*/) { + if (handle->mkl_keep_output) { + Kokkos::Timer copy_time; + const nnz_lno_t nnz = rows_start[num_rows]; handle->set_c_nnz(nnz); + copy(make_host_view(rows_start, num_rows + 1), row_mapC); + if (verbose) + std::cout << "\tMKL rowmap export time:" << copy_time.seconds() + << std::endl; } + }; - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) { - throw std::runtime_error("Error at mkl_sparse_destroy A\n"); - return; - } - - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) { - throw std::runtime_error("Error at mkl_sparse_destroy B\n"); - return; - } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) { - throw std::runtime_error("Error at mkl_sparse_destroy C\n"); - return; - } - } else if (std::is_same::value) { - /* - std::cout << "create a" << std::endl; - std::cout << "m:" << m << " n:" << n << std::endl; - std::cout << "a_xadj[0]:" << a_xadj[0] << " a_xadj[m]:" << a_xadj[m] << - std::endl; std::cout << "a_adj[a_xadj[m] - 1]:" << a_adj[a_xadj[m] - 1] << - " a_ew[a_xadj[m] - 1]:" << a_ew[a_xadj[m] - 1] << std::endl; - */ - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_d_create_csr(&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj, - a_xadj + 1, a_adj, (double *)a_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n"); - return; - } - - // std::cout << "create b" << std::endl; - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_d_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj, - b_xadj + 1, b_adj, (double *)b_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n"); - return; - } + // use dummy values for A and B inputs + a_values_view_type tmp_valsA( + Kokkos::ViewAllocateWithoutInitializing("tmp_valuesA"), + entriesA.extent(0)); + b_values_view_type tmp_valsB( + Kokkos::ViewAllocateWithoutInitializing("tmp_valuesB"), + entriesB.extent(0)); - sparse_operation_t operation; - if (transposeA && transposeB) { - operation = SPARSE_OPERATION_TRANSPOSE; - } else if (!(transposeA || 
transposeB)) { - operation = SPARSE_OPERATION_NON_TRANSPOSE; - } else { - throw std::runtime_error( - "MKL either transpose both matrices, or none for SPGEMM\n"); - return; - } + spmm(handle, m, n, k, row_mapA, entriesA, tmp_valsA, transposeA, row_mapB, + entriesB, tmp_valsB, transposeB, verbose, export_rowmap); - Kokkos::Timer timer1; - bool success = - SPARSE_STATUS_SUCCESS != mkl_sparse_spmm(operation, A, B, &C); - if (verbose) - std::cout << "Actual DOUBLE MKL SPMM Time Without Free:" - << timer1.seconds() << std::endl; - mkl_free_buffers(); - if (verbose) - std::cout << "Actual DOUBLE MKL SPMM Time:" << timer1.seconds() - << std::endl; - - if (success) { - throw std::runtime_error( - "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n"); - return; - } else { - sparse_index_base_t c_indexing; - MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns; - double *values; - - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_d_export_csr(C, &c_indexing, &c_rows, &c_cols, - &rows_start, &rows_end, &columns, - &values)) { - throw std::runtime_error( - "ERROR at exporting result matrix in mkl_sparse_spmm\n"); - return; - } - - if (SPARSE_INDEX_BASE_ZERO != c_indexing) { - throw std::runtime_error("C is not zero based indexed\n"); - return; - } - if (handle->mkl_keep_output) { - Kokkos::Timer copy_time; - - KokkosKernels::Impl::copy_vector< - MKL_INT *, typename cin_row_index_view_type::non_const_type, - MyExecSpace>(m, rows_start, row_mapC); - idx nnz = row_mapC(m) = rows_end[m - 1]; - handle->set_c_nnz(nnz); - - double copy_time_d = copy_time.seconds(); - if (verbose) std::cout << "MKL COPYTIME:" << copy_time_d << std::endl; - } - } + if (verbose) + std::cout << "MKL symbolic time:" << timer.seconds() << std::endl; + } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) { - throw std::runtime_error("Error at mkl_sparse_destroy A\n"); - return; - } + static void mkl_numeric( + KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, + a_rowmap_view_type row_mapA, 
a_index_view_type entriesA, + a_values_view_type valuesA, bool transposeA, b_rowmap_view_type row_mapB, + b_index_view_type entriesB, b_values_view_type valuesB, bool transposeB, + c_rowmap_view_type /* row_mapC */, c_index_view_type entriesC, + c_values_view_type valuesC, bool verbose = false) { + Kokkos::Timer timer; + + const auto export_values = + [&](MKL_INT num_rows, MKL_INT *rows_start, MKL_INT *columns, + typename KernelHandle::nnz_scalar_t *values) { + if (handle->mkl_keep_output) { + Kokkos::Timer copy_time; + const nnz_lno_t nnz = rows_start[num_rows]; + copy(make_host_view(columns, nnz), entriesC); + copy(make_host_view(values, nnz), valuesC); + if (verbose) + std::cout << "\tMKL values export time:" << copy_time.seconds() + << std::endl; + } + }; + + spmm(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA, row_mapB, + entriesB, valuesB, transposeB, verbose, export_values); + + if (verbose) + std::cout << "MKL numeric time:" << timer.seconds() << std::endl; + } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) { - throw std::runtime_error("Error at mkl_sparse_destroy B\n"); - return; - } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) { - throw std::runtime_error("Error at mkl_sparse_destroy C\n"); - return; - } + private: + template + static void spmm(KernelHandle * /* handle */, nnz_lno_t m, nnz_lno_t n, + nnz_lno_t k, a_rowmap_view_type row_mapA, + a_index_view_type entriesA, a_values_view_type valuesA, + + bool transposeA, b_rowmap_view_type row_mapB, + b_index_view_type entriesB, b_values_view_type valuesB, + bool transposeB, bool verbose, const CB &callback) { + if (!std::is_same::value) { + throw std::runtime_error("MKL requires local ordinals to be integer.\n"); + } - } else { - throw std::runtime_error( - "MKL requires float or double values. 
Complex values are not " - "implemented yet.\n"); + if (m < 1 || n < 1 || k < 1 || entriesA.extent(0) < 1 || + entriesB.extent(0) < 1) { return; } - } else { - throw std::runtime_error("MKL requires local ordinals to be integer.\n"); - return; - } -#else - (void)handle; - (void)m; - (void)n; - (void)k; - (void)row_mapA; - (void)row_mapB; - (void)row_mapC; - (void)entriesA; - (void)entriesB; - (void)transposeA; - (void)transposeB; - (void)verbose; - throw std::runtime_error("MKL IS NOT DEFINED\n"); - // return; -#endif -} -template < - typename KernelHandle, typename in_row_index_view_type, - typename in_nonzero_index_view_type, typename in_nonzero_value_view_type, - typename bin_row_index_view_type, typename bin_nonzero_index_view_type, - typename bin_nonzero_value_view_type, typename cin_row_index_view_type, - typename cin_nonzero_index_view_type, typename cin_nonzero_value_view_type> -void mkl_apply(KernelHandle *handle, typename KernelHandle::nnz_lno_t m, - typename KernelHandle::nnz_lno_t n, - typename KernelHandle::nnz_lno_t k, - in_row_index_view_type row_mapA, - in_nonzero_index_view_type entriesA, - in_nonzero_value_view_type valuesA, - - bool transposeA, bin_row_index_view_type row_mapB, - bin_nonzero_index_view_type entriesB, - bin_nonzero_value_view_type valuesB, bool transposeB, - cin_row_index_view_type row_mapC, - cin_nonzero_index_view_type entriesC, - cin_nonzero_value_view_type valuesC, bool verbose = false) { -#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + const auto create_mirror = [](auto view) { + return Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view); + }; - typedef typename KernelHandle::nnz_lno_t idx; - typedef typename KernelHandle::size_type size_type; + auto h_rowsA = create_mirror(row_mapA); + auto h_rowsB = create_mirror(row_mapB); + const int *a_xadj = reinterpret_cast(h_rowsA.data()); + const int *b_xadj = reinterpret_cast(h_rowsB.data()); + int_tmp_view_t a_xadj_v, b_xadj_v; - typedef typename 
KernelHandle::HandleTempMemorySpace HandleTempMemorySpace; - typedef typename Kokkos::View - int_temp_work_view_t; - - typedef typename KernelHandle::nnz_scalar_t value_type; - - typedef typename KernelHandle::HandleExecSpace MyExecSpace; - /* - if (!( - (Kokkos::SpaceAccessibility::accessible) && - (Kokkos::SpaceAccessibility::accessible) && - (Kokkos::SpaceAccessibility::accessible) ) - ){ - throw std::runtime_error ("MEMORY IS NOT ALLOCATED IN HOST DEVICE for - MKL\n"); return; - } - */ - if (std::is_same::value) { - int *a_xadj = NULL; - int *b_xadj = NULL; - int_temp_work_view_t a_xadj_v, b_xadj_v; - - if (std::is_same::value) { - a_xadj = (int *)row_mapA.data(); - b_xadj = (int *)row_mapB.data(); - } else { - // TODO test this case. - - Kokkos::Timer copy_time; - const int max_integer = 2147483647; - if (entriesB.extent(0) > max_integer || - entriesA.extent(0) > max_integer) { + if (!std::is_same::value) { + if (entriesA.extent(0) > INT_MAX || entriesB.extent(0) > INT_MAX) { throw std::runtime_error( - "MKL requires integer values for size type for SPGEMM. Copying to " + "MKL requires integer values for size type for SPGEMM. 
Copying " + "to " "integer will cause overflow.\n"); - return; } - a_xadj_v = int_temp_work_view_t("tmpa", m + 1); - a_xadj = (int *)a_xadj_v.data(); - b_xadj_v = int_temp_work_view_t("tmpb", n + 1); - b_xadj = (int *)b_xadj_v.data(); - - KokkosKernels::Impl::copy_vector( - m + 1, row_mapA, a_xadj_v); - - KokkosKernels::Impl::copy_vector( - m + 1, row_mapB, b_xadj_v); + static_assert( + std::is_same::value, + "deep_copy requires non-const destination type"); + Kokkos::Timer copy_time; + a_xadj_v = int_tmp_view_t("tmpa", m + 1); + b_xadj_v = int_tmp_view_t("tmpb", n + 1); + Kokkos::deep_copy(a_xadj_v, h_rowsA); + Kokkos::deep_copy(b_xadj_v, h_rowsB); + a_xadj = (int *)a_xadj_v.data(); + b_xadj = (int *)b_xadj_v.data(); if (verbose) - std::cout << "MKL COPY size type to int TIME:" << copy_time.seconds() - << std::endl; + std::cout << "\tMKL int-type temp rowmap copy time:" + << copy_time.seconds() << std::endl; } - int *a_adj = (int *)entriesA.data(); - int *b_adj = (int *)entriesB.data(); - - const value_type *a_ew = valuesA.data(); - const value_type *b_ew = valuesB.data(); - - sparse_matrix_t A; - sparse_matrix_t B; - sparse_matrix_t C; - - if (std::is_same::value) { - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_s_create_csr(&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj, - a_xadj + 1, a_adj, (float *)a_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n"); - return; - } - - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_s_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj, - b_xadj + 1, b_adj, (float *)b_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n"); - return; - } - - sparse_operation_t operation; - if (transposeA && transposeB) { - operation = SPARSE_OPERATION_TRANSPOSE; - } else if (!(transposeA || transposeB)) { - operation = SPARSE_OPERATION_NON_TRANSPOSE; - } else { - throw std::runtime_error( - "MKL either transpose both matrices, or none for SPGEMM\n"); - return; - } - - Kokkos::Timer 
timer1; - bool success = - SPARSE_STATUS_SUCCESS != mkl_sparse_spmm(operation, A, B, &C); - if (verbose) - std::cout << "Actual FLOAT MKL SPMM Time:" << timer1.seconds() - << std::endl; - - if (success) { - throw std::runtime_error( - "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n"); - - return; - } else { - sparse_index_base_t c_indexing; - MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns; - float *values; - - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_s_export_csr(C, &c_indexing, &c_rows, &c_cols, - &rows_start, &rows_end, &columns, - &values)) { - throw std::runtime_error( - "ERROR at exporting result matrix in mkl_sparse_spmm\n"); - return; - } - - if (SPARSE_INDEX_BASE_ZERO != c_indexing) { - throw std::runtime_error("C is not zero based indexed\n"); - return; - } - - // KokkosKernels::Impl::copy_vector (m, rows_start, - // row_mapC); idx nnz = row_mapC(m) = rows_end[m - 1]; - idx nnz = rows_end[m - 1]; - using non_const_size_type = - typename cin_row_index_view_type::non_const_value_type; - auto *tmpPtr = const_cast(row_mapC.data()); - tmpPtr[m] = nnz; - - KokkosKernels::Impl::copy_vector< - MKL_INT *, typename cin_nonzero_index_view_type::non_const_type, - MyExecSpace>(nnz, columns, entriesC); - KokkosKernels::Impl::copy_vector< - float *, typename cin_nonzero_value_view_type::non_const_type, - MyExecSpace>(nnz, values, valuesC); - } - - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) { - throw std::runtime_error("Error at mkl_sparse_destroy A\n"); - return; - } - - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) { - throw std::runtime_error("Error at mkl_sparse_destroy B\n"); - return; - } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) { - throw std::runtime_error("Error at mkl_sparse_destroy C\n"); - return; - } - } else if (std::is_same::value) { - /* - std::cout << "create a" << std::endl; - std::cout << "m:" << m << " n:" << n << std::endl; - std::cout << "a_xadj[0]:" << a_xadj[0] << " a_xadj[m]:" << a_xadj[m] << - std::endl; 
std::cout << "a_adj[a_xadj[m] - 1]:" << a_adj[a_xadj[m] - 1] << - " a_ew[a_xadj[m] - 1]:" << a_ew[a_xadj[m] - 1] << std::endl; - */ - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_d_create_csr(&A, SPARSE_INDEX_BASE_ZERO, m, n, a_xadj, - a_xadj + 1, a_adj, (double *)a_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr A matrix\n"); - return; - } - - // std::cout << "create b" << std::endl; - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_d_create_csr(&B, SPARSE_INDEX_BASE_ZERO, n, k, b_xadj, - b_xadj + 1, b_adj, (double *)b_ew)) { - throw std::runtime_error( - "CANNOT CREATE mkl_sparse_s_create_csr B matrix\n"); - return; - } - - sparse_operation_t operation; - if (transposeA && transposeB) { - operation = SPARSE_OPERATION_TRANSPOSE; - } else if (!(transposeA || transposeB)) { - operation = SPARSE_OPERATION_NON_TRANSPOSE; - } else { - throw std::runtime_error( - "MKL either transpose both matrices, or none for SPGEMM\n"); - return; - } - - Kokkos::Timer timer1; - bool success = - SPARSE_STATUS_SUCCESS != mkl_sparse_spmm(operation, A, B, &C); - if (verbose) - std::cout << "Actual DOUBLE MKL SPMM Time Without Free:" - << timer1.seconds() << std::endl; + auto h_valsA = create_mirror(valuesA); + auto h_valsB = create_mirror(valuesB); + auto h_entriesA = create_mirror(entriesA); + auto h_entriesB = create_mirror(entriesB); + const int *a_adj = reinterpret_cast(h_entriesA.data()); + const int *b_adj = reinterpret_cast(h_entriesB.data()); + const value_type *a_ew = h_valsA.data(); + const value_type *b_ew = h_valsB.data(); + + // Hack: we discard const with pointer casts here to work around MKL + // requiring mutable input and our symbolic interface not providing it + using Matrix = MKLSparseMatrix; + Matrix A(m, n, (int *)a_xadj, (int *)a_adj, (value_type *)a_ew); + Matrix B(n, k, (int *)b_xadj, (int *)b_adj, (value_type *)b_ew); + + sparse_operation_t operation; + if (transposeA && transposeB) { + operation = SPARSE_OPERATION_TRANSPOSE; + } else if 
(!(transposeA || transposeB)) { + operation = SPARSE_OPERATION_NON_TRANSPOSE; + } else { + throw std::runtime_error( + "MKL either transpose both matrices, or none for SPGEMM\n"); + } - mkl_free_buffers(); - if (verbose) - std::cout << "Actual DOUBLE MKL SPMM Time:" << timer1.seconds() - << std::endl; + Kokkos::Timer timer1; + Matrix C = mkl_spmm(operation, A, B); + if (verbose) { + std::cout << "\tMKL spmm ("; + if (std::is_same::value) + std::cout << "FLOAT"; + else if (std::is_same::value) + std::cout << "DOUBLE"; + else + std::cout << "?"; + std::cout << ") time:" << timer1.seconds() << std::endl; + } - if (success) { - throw std::runtime_error( - "ERROR at SPGEMM multiplication in mkl_sparse_spmm\n"); - return; - } else { - sparse_index_base_t c_indexing; - MKL_INT c_rows, c_cols, *rows_start, *rows_end, *columns; - double *values; - - if (SPARSE_STATUS_SUCCESS != - mkl_sparse_d_export_csr(C, &c_indexing, &c_rows, &c_cols, - &rows_start, &rows_end, &columns, - &values)) { - throw std::runtime_error( - "ERROR at exporting result matrix in mkl_sparse_spmm\n"); - return; - } - - if (SPARSE_INDEX_BASE_ZERO != c_indexing) { - throw std::runtime_error("C is not zero based indexed\n"); - return; - } - if (handle->mkl_keep_output) { - Kokkos::Timer copy_time; - - // KokkosKernels::Impl::copy_vector (m, - // rows_start, row_mapC); idx nnz = row_mapC(m) = rows_end[m - 1]; - idx nnz = rows_end[m - 1]; - using non_const_size_type = - typename cin_row_index_view_type::non_const_value_type; - auto *tmpPtr = const_cast(row_mapC.data()); - tmpPtr[m] = nnz; - - KokkosKernels::Impl::copy_vector< - MKL_INT *, typename cin_nonzero_index_view_type::non_const_type, - MyExecSpace>(nnz, columns, entriesC); - KokkosKernels::Impl::copy_vector< - double *, typename cin_nonzero_value_view_type::non_const_type, - MyExecSpace>(nnz, values, valuesC); - double copy_time_d = copy_time.seconds(); - if (verbose) std::cout << "MKL COPYTIME:" << copy_time_d << std::endl; - } - } + MKL_INT 
num_rows, num_cols, *rows_start, *columns; + value_type *values; + C.export_data(num_rows, num_cols, rows_start, columns, values); + callback(m, rows_start, columns, values); - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(A)) { - throw std::runtime_error("Error at mkl_sparse_destroy A\n"); - return; - } + A.destroy(); + B.destroy(); + C.destroy(); + } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(B)) { - throw std::runtime_error("Error at mkl_sparse_destroy B\n"); - return; - } - if (SPARSE_STATUS_SUCCESS != mkl_sparse_destroy(C)) { - throw std::runtime_error("Error at mkl_sparse_destroy C\n"); - return; - } + template + inline static void copy(from_view_type from, dst_view_type to) { + auto h_from = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), from); + auto h_to = Kokkos::create_mirror_view(Kokkos::HostSpace(), to); + Kokkos::deep_copy(h_to, h_from); // view copy (for different element types) + Kokkos::deep_copy(to, h_to); + } - } else { - throw std::runtime_error( - "MKL requires float or double values. 
Complex values are not " - "implemented yet.\n"); - return; - } - } else { - throw std::runtime_error("MKL requires local ordinals to be integer.\n"); - return; + template > + inline static view_type make_host_view(const T *data, size_t num_elems) { + return view_type(data, num_elems); } -#else - (void)handle; - (void)m; - (void)n; - (void)k; - (void)row_mapA; - (void)row_mapB; - (void)row_mapC; - (void)entriesA; - (void)entriesB; - (void)entriesC; - (void)valuesA; - (void)valuesB; - (void)valuesC; - (void)transposeA; - (void)transposeB; - (void)verbose; - throw std::runtime_error("MKL IS NOT DEFINED\n"); - // return; -#endif +}; + +template +void mkl_symbolic(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, + a_rowmap_type row_mapA, a_index_type entriesA, + bool transposeA, b_rowmap_type row_mapB, + b_index_type entriesB, bool transposeB, + c_rowmap_type row_mapC, bool verbose = false) { + using values_type = typename KernelHandle::scalar_temp_work_view_t; + using c_index_type = b_index_type; + using mkl = MKL_SPGEMM; + mkl::mkl_symbolic(handle, m, n, k, row_mapA, entriesA, transposeA, row_mapB, + entriesB, transposeB, row_mapC, verbose); } + +template +void mkl_numeric(KernelHandle *handle, nnz_lno_t m, nnz_lno_t n, nnz_lno_t k, + a_rowmap_type row_mapA, a_index_type entriesA, + a_values_type valuesA, bool transposeA, b_rowmap_type row_mapB, + b_index_type entriesB, b_values_type valuesB, bool transposeB, + c_rowmap_type row_mapC, c_index_type entriesC, + c_values_type valuesC, bool verbose = false) { + using mkl = + MKL_SPGEMM; + mkl::mkl_numeric(handle, m, n, k, row_mapA, entriesA, valuesA, transposeA, + row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC, + valuesC, verbose); +} + } // namespace Impl } // namespace KokkosSparse -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_MKL +#endif // _KOKKOSSPGEMMMKL_HPP diff --git a/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp b/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp index 
beb969fc77..24008d3b26 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_numeric_spec.hpp @@ -211,23 +211,6 @@ struct SPGEMM_NUMERIC< if (!sh->is_symbolic_called()) { throw std::runtime_error( "Call spgemm symbolic before calling SpGEMM numeric"); - /* - KokkosSparse::Experimental::spgemm_symbolic( - handle, m, n, k, - row_mapA, entriesA, transposeA, - row_mapB, entriesB, transposeB, - row_mapC - ); - typename c_size_view_t_::value_type c_nnz_size = - handle->get_spgemm_handle()->get_c_nnz(); if (c_nnz_size){ entriesC = - c_lno_view_t (Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), - c_nnz_size); valuesC = c_scalar_view_t - (Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size); - } - */ } switch (sh->get_algorithm_type()) { @@ -245,9 +228,13 @@ struct SPGEMM_NUMERIC< transposeB, row_mapC, entriesC, valuesC); break; case SPGEMM_MKL: - mkl_apply(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, - row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC, - valuesC, handle->get_verbose()); +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + mkl_numeric(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, + row_mapB, entriesB, valuesB, transposeB, row_mapC, entriesC, + valuesC, handle->get_verbose()); +#else + throw std::runtime_error("MKL was not enabled in this build!"); +#endif break; case SPGEMM_MKL2PHASE: mkl2phase_apply(sh, m, n, k, row_mapA, entriesA, valuesA, transposeA, diff --git a/src/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp b/src/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp index 181984ebe9..d83ae6767c 100644 --- a/src/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spgemm_symbolic_spec.hpp @@ -179,9 +179,13 @@ struct SPGEMM_SYMBOLICget_verbose()); break; +#else + throw std::runtime_error("MKL was not enabled in this build!"); +#endif } sh->set_call_symbolic(); } diff --git 
a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp index d0b80ace69..4af8606dfb 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_numeric_impl.hpp @@ -242,52 +242,54 @@ struct ILUKLvlSchedTP1NumericFunctor { KOKKOS_INLINE_FUNCTION void operator()(const member_type &team) const { - auto my_league = team.league_rank(); // map to rowid - auto rowid = level_idx(my_league + lev_start); - auto my_team = team.team_rank(); + nnz_lno_t my_team = static_cast(team.league_rank()); + nnz_lno_t rowid = + static_cast(level_idx(my_team + lev_start)); // map to rowid - auto k1 = L_row_map(rowid); - auto k2 = L_row_map(rowid + 1); + size_type k1 = static_cast(L_row_map(rowid)); + size_type k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), [&](const size_type k) { - auto col = L_entries(k); - L_values(k) = 0.0; - iw(my_league, col) = k; + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + iw(my_team, col) = static_cast(k); }); #else Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - auto col = L_entries(k); - L_values(k) = 0.0; - iw(my_league, col) = k; + nnz_lno_t col = static_cast(L_entries(k)); + L_values(k) = 0.0; + iw(my_team, col) = static_cast(k); }); #endif #ifdef KEEP_DIAG - if (my_team == 0) L_values(k2 - 1) = scalar_t(1.0); + // if (my_thread == 0) L_values(k2 - 1) = scalar_t(1.0); + Kokkos::single(Kokkos::PerTeam(team), + [&]() { L_values(k2 - 1) = scalar_t(1.0); }); #endif team.team_barrier(); - k1 = U_row_map(rowid); - k2 = U_row_map(rowid + 1); + k1 = static_cast(U_row_map(rowid)); + k2 = static_cast(U_row_map(rowid + 1)); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - auto col = U_entries(k); - U_values(k) = 0.0; - iw(my_league, col) = k; + nnz_lno_t col = static_cast(U_entries(k)); + 
U_values(k) = 0.0; + iw(my_team, col) = static_cast(k); }); team.team_barrier(); // Unpack the ith row of A - k1 = A_row_map(rowid); - k2 = A_row_map(rowid + 1); + k1 = static_cast(A_row_map(rowid)); + k2 = static_cast(A_row_map(rowid + 1)); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), [&](const size_type k) { - auto col = A_entries(k); - auto ipos = iw(my_league, col); + nnz_lno_t col = static_cast(A_entries(k)); + nnz_lno_t ipos = iw(my_team, col); if (col < rowid) L_values(ipos) = A_values(k); else @@ -297,20 +299,22 @@ struct ILUKLvlSchedTP1NumericFunctor { team.team_barrier(); // Eliminate prev rows - k1 = L_row_map(rowid); - k2 = L_row_map(rowid + 1); + k1 = static_cast(L_row_map(rowid)); + k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG - for (auto k = k1; k < k2 - 1; ++k) { + for (size_type k = k1; k < k2 - 1; k++) #else - for (auto k = k1; k < k2; ++k) { + for (size_type k = k1; k < k2; k++) #endif - auto prev_row = L_entries(k); + { + nnz_lno_t prev_row = L_entries(k); #ifdef KEEP_DIAG - auto fact = L_values(k) / U_values(U_row_map(prev_row)); + scalar_t fact = L_values(k) / U_values(U_row_map(prev_row)); #else - auto fact = L_values(k) * U_values(U_row_map(prev_row)); + scalar_t fact = L_values(k) * U_values(U_row_map(prev_row)); #endif - if (my_team == 0) L_values(k) = fact; + // if (my_thread == 0) L_values(k) = fact; + Kokkos::single(Kokkos::PerTeam(team), [&]() { L_values(k) = fact; }); team.team_barrier(); @@ -318,10 +322,10 @@ struct ILUKLvlSchedTP1NumericFunctor { Kokkos::TeamThreadRange(team, U_row_map(prev_row) + 1, U_row_map(prev_row + 1)), [&](const size_type kk) { - auto col = U_entries(kk); - auto ipos = iw(my_league, col); + nnz_lno_t col = static_cast(U_entries(kk)); + nnz_lno_t ipos = iw(my_team, col); + auto lxu = -U_values(kk) * fact; if (ipos != -1) { - auto lxu = -U_values(kk) * fact; if (col < rowid) Kokkos::atomic_add(&L_values(ipos), lxu); else @@ -332,40 +336,49 @@ struct ILUKLvlSchedTP1NumericFunctor { 
team.team_barrier(); } // end for k - if (my_team == 0) { + // if (my_thread == 0) { + Kokkos::single(Kokkos::PerTeam(team), [&]() { + nnz_lno_t ipos = iw(my_team, rowid); #ifdef KEEP_DIAG - if (U_values(iw(my_league, rowid)) == 0.0) { - U_values(iw(my_league, rowid)) = 1e6; + if (U_values(ipos) == 0.0) { + U_values(ipos) = 1e6; } #else - if (U_values(iw(my_league, rowid)) == 0.0) { - U_values(iw(my_league, rowid)) = 1e6; + if (U_values(ipos) == 0.0) { + U_values(ipos) = 1e6; } else { - U_values(iw(my_league, rowid)) = 1.0 / U_values(iw(my_league, rowid)); + U_values(ipos) = 1.0 / U_values(ipos); } #endif - } + }); + //} team.team_barrier(); // Reset - k1 = L_row_map(rowid); - k2 = L_row_map(rowid + 1); + k1 = static_cast(L_row_map(rowid)); + k2 = static_cast(L_row_map(rowid + 1)); #ifdef KEEP_DIAG - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2 - 1), - [&](const size_type k) { iw(my_league, L_entries(k)) = -1; }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2 - 1), + [&](const size_type k) { + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_team, col) = -1; + }); #else - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { iw(my_league, L_entries(k)) = -1; }); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + nnz_lno_t col = static_cast(L_entries(k)); + iw(my_team, col) = -1; + }); #endif - k1 = U_row_map(rowid); - k2 = U_row_map(rowid + 1); - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, k1, k2), - [&](const size_type k) { iw(my_league, U_entries(k)) = -1; }); + k1 = static_cast(U_row_map(rowid)); + k2 = static_cast(U_row_map(rowid + 1)); + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, k1, k2), + [&](const size_type k) { + nnz_lno_t col = static_cast(U_entries(k)); + iw(my_team, col) = -1; + }); } }; @@ -379,23 +392,17 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, LValuesType &L_values, const URowMapType &U_row_map, 
const UEntriesType &U_entries, UValuesType &U_values) { using execution_space = typename IlukHandle::execution_space; - using memory_space = typename IlukHandle::memory_space; using size_type = typename IlukHandle::size_type; using nnz_lno_t = typename IlukHandle::nnz_lno_t; using HandleDeviceEntriesType = typename IlukHandle::nnz_lno_view_t; - using WorkViewType = - Kokkos::View>; - using LevelHostViewType = Kokkos::View; + using WorkViewType = typename IlukHandle::work_view_t; + using LevelHostViewType = typename IlukHandle::nnz_lno_view_host_t; size_type nlevels = thandle.get_num_levels(); - size_type nrows = thandle.get_nrows(); // Keep these as host View, create device version and copy back to host - HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); - HandleDeviceEntriesType level_idx = thandle.get_level_idx(); - HandleDeviceEntriesType level_nchunks = thandle.get_level_nchunks(); - HandleDeviceEntriesType level_nrowsperchunk = - thandle.get_level_nrowsperchunk(); + HandleDeviceEntriesType level_ptr = thandle.get_level_ptr(); + HandleDeviceEntriesType level_idx = thandle.get_level_idx(); // Make level_ptr_h a separate allocation, since it will be accessed on host // between kernel launches. 
If a mirror were used and level_ptr is in UVM @@ -409,25 +416,13 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, level_ptr.extent(0)); Kokkos::deep_copy(level_ptr_h, level_ptr); + //{ if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - level_nchunks_h = LevelHostViewType( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Host level nchunks"), - level_nchunks.extent(0)); - level_nrowsperchunk_h = - LevelHostViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, - "Host level nrowsperchunk"), - level_nrowsperchunk.extent(0)); - Kokkos::deep_copy(level_nchunks_h, level_nchunks); - Kokkos::deep_copy(level_nrowsperchunk_h, level_nrowsperchunk); - iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), - thandle.get_level_maxrowsperchunk(), nrows); - Kokkos::deep_copy(iw, nnz_lno_t(-1)); - } else { - iw = WorkViewType(Kokkos::view_alloc(Kokkos::WithoutInitializing, "iw"), - thandle.get_level_maxrows(), nrows); - Kokkos::deep_copy(iw, nnz_lno_t(-1)); + level_nchunks_h = thandle.get_level_nchunks(); + level_nrowsperchunk_h = thandle.get_level_nrowsperchunk(); } + iw = thandle.get_iw(); // Main loop must be performed sequential. Question: Try out Cuda's graph // stuff to reduce kernel launch overhead @@ -476,49 +471,13 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, else Kokkos::parallel_for("parfor_l_team", policy_type(lvl_nrows_chunk, team_size), tstf); - + Kokkos::fence(); lvl_rowid_start += lvl_nrows_chunk; } } - // /* - // // TP2 algorithm has issues with some offset-ordinal combo to be - // addressed else if ( thandle.get_algorithm() == - // KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { - // typedef Kokkos::TeamPolicy tvt_policy_type; - // - // int team_size = thandle.get_team_size(); - // if ( team_size == -1 ) { - // team_size = std::is_same< typename - // Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace - // >::value ? 
1 : 128; - // } - // int vector_size = thandle.get_team_size(); - // if ( vector_size == -1 ) { - // vector_size = std::is_same< typename - // Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace - // >::value ? 1 : 4; - // } - // - // // This impl: "chunk" lvl_nodes into node_groups; a league_rank - // is responsible for processing that many nodes - // // TeamThreadRange over number of node_groups - // // To avoid masking threads, 1 thread (team) per node in - // node_group - // // ThreadVectorRange responsible for the actual solve - // computation const int node_groups = team_size; - // - // LowerTriLvlSchedTP2SolverFunctor - // tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, - // row_count, node_groups); - // Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( - // (int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, - // vector_size ), tstf); - // } // end elseif - // */ - } // end if } // end for lvl + //} // Output check #ifdef NUMERIC_OUTPUT_INFO @@ -526,7 +485,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, std::cout << " nnzL: " << thandle.get_nnzL() << std::endl; std::cout << " L_row_map = "; - for (size_type i = 0; i < nrows + 1; ++i) { + for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { std::cout << L_row_map(i) << " "; } std::cout << std::endl; @@ -545,7 +504,7 @@ void iluk_numeric(IlukHandle &thandle, const ARowMapType &A_row_map, std::cout << " nnzU: " << thandle.get_nnzU() << std::endl; std::cout << " U_row_map = "; - for (size_type i = 0; i < nrows + 1; ++i) { + for (size_type i = 0; i < thandle.get_nrows() + 1; ++i) { std::cout << U_row_map(i) << " "; } std::cout << std::endl; diff --git a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp index 90bb88e057..691d624963 100644 --- a/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spiluk_symbolic_impl.hpp @@ -121,15 
+121,15 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, // SEQLVLSCHD_TP1 algorithm (chunks) template -void level_sched(IlukHandle& thandle, const RowMapType row_map, - const EntriesType entries, LevelType1& level_list, - LevelType2& level_ptr, LevelType2& level_idx, - LevelType3& level_nchunks, LevelType3& level_nrowsperchunk, - size_type& nlevels) { + class LevelType1, class LevelType2, class size_type> +void level_sched_tp(IlukHandle& thandle, const RowMapType row_map, + const EntriesType entries, LevelType1& level_list, + LevelType2& level_ptr, LevelType2& level_idx, + size_type& nlevels) { // Scheduling currently compute on host - using nnz_lno_t = typename IlukHandle::nnz_lno_t; + using nnz_lno_t = typename IlukHandle::nnz_lno_t; + using nnz_lno_view_host_t = typename IlukHandle::nnz_lno_view_host_t; size_type nrows = thandle.get_nrows(); @@ -168,11 +168,10 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, level_ptr(0) = 0; // Find max rows, number of chunks, max rows of chunks across levels - using HostViewType = - Kokkos::View; - - HostViewType lnchunks("lnchunks", nlevels); - HostViewType lnrowsperchunk("lnrowsperchunk", nlevels); + thandle.alloc_level_nchunks(nlevels); + thandle.alloc_level_nrowsperchunk(nlevels); + nnz_lno_view_host_t lnchunks = thandle.get_level_nchunks(); + nnz_lno_view_host_t lnrowsperchunk = thandle.get_level_nrowsperchunk(); #ifdef KOKKOS_ENABLE_CUDA using memory_space = typename IlukHandle::memory_space; @@ -214,9 +213,6 @@ void level_sched(IlukHandle& thandle, const RowMapType row_map, thandle.set_num_levels(nlevels); thandle.set_level_maxrows(maxrows); thandle.set_level_maxrowsperchunk(maxrowsperchunk); - - level_nchunks = lnchunks; - level_nrowsperchunk = lnrowsperchunk; } // Linear Search for the smallest row index @@ -326,7 +322,6 @@ void iluk_symbolic(IlukHandle& thandle, HostTmpViewType h_iw("h_iw", nrows); HostTmpViewType h_iL("h_iL", nrows); HostTmpViewType h_llev("h_llev", nrows); - 
HostTmpViewType level_nchunks, level_nrowsperchunk; size_type cntL = 0; size_type cntU = 0; @@ -472,19 +467,13 @@ void iluk_symbolic(IlukHandle& thandle, // Level scheduling on L if (thandle.get_algorithm() == KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1) { - level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, - level_idx, level_nchunks, level_nrowsperchunk, nlev); - - thandle.alloc_level_nchunks(nlev); - thandle.alloc_level_nrowsperchunk(nlev); - HandleDeviceEntriesType dlevel_nchunks = thandle.get_level_nchunks(); - HandleDeviceEntriesType dlevel_nrowsperchunk = - thandle.get_level_nrowsperchunk(); - Kokkos::deep_copy(dlevel_nchunks, level_nchunks); - Kokkos::deep_copy(dlevel_nrowsperchunk, level_nrowsperchunk); + level_sched_tp(thandle, L_row_map, L_entries, level_list, level_ptr, + level_idx, nlev); + thandle.alloc_iw(thandle.get_level_maxrowsperchunk(), nrows); } else { level_sched(thandle, L_row_map, L_entries, level_list, level_ptr, level_idx, nlev); + thandle.alloc_iw(thandle.get_level_maxrows(), nrows); } Kokkos::deep_copy(dlevel_ptr, level_ptr); diff --git a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp index 7132ec0fe1..14b75f1c39 100644 --- a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp @@ -101,10 +101,10 @@ struct spmv_mv_blockcrsmatrix_eti_spec_avail { const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const *, LAYOUT_TYPE, \ + SCALAR_TYPE const **, LAYOUT_TYPE, \ Kokkos::Device, \ Kokkos::MemoryTraits, \ - SCALAR_TYPE *, LAYOUT_TYPE, \ + SCALAR_TYPE **, LAYOUT_TYPE, \ Kokkos::Device, \ Kokkos::MemoryTraits > { \ enum : bool { value = true }; \ diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index b87a9fa460..313098372a 100644 --- 
a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -46,6 +46,7 @@ #define KOKKOSSPARSE_IMPL_SPMV_BSRMATRIX_IMPL_HPP_ #include "KokkosKernels_Error.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" #if defined(KOKKOS_ENABLE_CUDA) && \ (defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_AMPERE)) @@ -320,10 +321,8 @@ struct BsrMatrixSpMVTensorCoreFunctor { // no barrier - each warp uses independent shared memory // load from the shared memory -#ifdef __CUDA_ARCH__ load_matrix_sync(fy, &sy(warpIdx_y, warpIdx_x, 0, 0), FRAG_N, nvcuda::wmma::mem_row_major); -#endif auto rowView = a.block_row_Const(blockIdx_i); @@ -363,17 +362,12 @@ struct BsrMatrixSpMVTensorCoreFunctor { const AOrdinal bj = bk + tj; // fill shmem with 0 outside of the block boundary -#ifdef __CUDA_ARCH__ if (bi < a.blockDim() && bj < a.blockDim()) { sa(ti / FRAG_M, ti % FRAG_M, tj) = AFragScalar(alpha * ap[bi * a.blockDim() + bj]); } else { sa(ti / FRAG_M, ti % FRAG_M, tj) = AFragScalar(0); } -#else - (void)bi; - (void)bj; -#endif } // collaborative load of X fragments into shared memory @@ -391,7 +385,6 @@ struct BsrMatrixSpMVTensorCoreFunctor { // load 0 outside of the block boundary // x is not necessarily a multiple of block size, so make sure access // is in bounds -#ifdef __CUDA_ARCH__ if (bi < a.blockDim() && bj < a.blockDim() && unsigned(blockIdx_j * a.blockDim() + bj) < x.extent(1)) { // tile is some fragments in the j/n direction that are frag_n wide @@ -400,15 +393,10 @@ struct BsrMatrixSpMVTensorCoreFunctor { } else { sx(tj / FRAG_N, ti, tj % FRAG_N) = XFragScalar(0); } -#else - (void)bi; - (void)bj; -#endif } mbr.team_barrier(); // load correct fragment from shared memory and accumulate -#ifdef __CUDA_ARCH__ // only need to do any math if our fragment will write a result back to // Y if (ay_i < static_cast(y.extent(0)) && @@ -417,17 +405,12 @@ struct BsrMatrixSpMVTensorCoreFunctor { load_matrix_sync(fx, &sx(warpIdx_x, 0, 0), 
FRAG_N); mma_sync(fy, fa, fx, fy); } -#endif } - (void)j; - (void)ap; } // loop through blocks in row of A -#ifdef __CUDA_ARCH__ // store Y fragments into shared memory store_matrix_sync(&sy(warpIdx_y, warpIdx_x, 0, 0), fy, FRAG_N, nvcuda::wmma::mem_row_major); -#endif // team loads its fragments of Y that make up part or all of the block of Y // it's responsible for. each warp loads the part corresponding to its y // fragment @@ -447,21 +430,16 @@ struct BsrMatrixSpMVTensorCoreFunctor { } } mbr.team_barrier(); - - // Suppress unused var warnings - // TODO (@cwpearson): Should this functor only compile on device? - (void)fx; - (void)fa; - (void)fy; } }; -/* Instantiate some common template parameter values - for BsrMatrixSpMVTensorCoreFunctor. - This is a struct instead of a function for template...using shorthand - Discriminates between complex (supported) and non-complex (unsupported) - scalar types, and throws a runtime error for unsupported types -*/ +/// \brief Avoid instantiating tensor core functor for unsupported types +/// +/// Instantiate some common template parameter values +/// for BsrMatrixSpMVTensorCoreFunctor. 
+/// This is a struct instead of a function for template...using shorthand +/// Discriminates between non-complex/on-GPU (supported) and otherwise +/// (unsupported) scalar types, and throws a runtime error for unsupported types template struct none_complex { const static bool value = !Kokkos::ArithTraits::is_complex && @@ -528,11 +507,22 @@ struct BsrMatrixSpMVTensorCoreDispatcher { !Kokkos::ArithTraits::is_complex; }; + /*true if T1::execution_space, T2, or T3 are all GPU exec space*/ + template + struct all_gpu { + const static bool value = KokkosKernels::Impl::kk_is_gpu_exec_space() && + KokkosKernels::Impl::kk_is_gpu_exec_space() && + KokkosKernels::Impl::kk_is_gpu_exec_space(); + }; + static void dispatch(YScalar alpha, AMatrix a, XMatrix x, YScalar beta, YMatrix y) { - using tag = - std::integral_constant::value>; + // tag will be false unless all conditions are met + using tag = std::integral_constant< + bool, none_complex::value && + all_gpu::value>; tag_dispatch(tag{}, alpha, a, x, beta, y); } }; @@ -552,7 +542,7 @@ struct BsrMatrixSpMVTensorCoreDispatcher { #include "KokkosBatched_Gemv_TeamVector_Internal.hpp" #include "KokkosBatched_Gemm_Serial_Internal.hpp" #include "KokkosBatched_Gemm_TeamVector_Internal.hpp" -#include "KokkosBatched_Scale_Internal.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" #include "KokkosBlas2_team_gemv_spec.hpp" @@ -649,7 +639,7 @@ struct BSR_GEMV_Functor { const y_value_type val_one = Kokkos::ArithTraits::one(); ; if (beta != val_one) { - KokkosBatched::TeamVectorScaleInternal::invoke( + KokkosBlas::Impl::TeamVectorScaleInternal::invoke( dev, block_dim, beta, Y_cur.data(), static_cast(Y_cur.stride_0())); } @@ -979,6 +969,8 @@ struct BSR_GEMV_Transpose_Functor { Kokkos::atomic_add(&Y_cur(ijk), shared_view(ijk)); }); + // + dev.team_barrier(); } } else { for (ordinal_type jBlock = 0; jBlock < count; ++jBlock) { @@ -1008,6 +1000,8 @@ struct BSR_GEMV_Transpose_Functor { [&](const 
ordinal_type &ijk) { Kokkos::atomic_add(&Y_cur(ijk), shared_y[ijk]); }); + // + dev.team_barrier(); } } } @@ -1282,7 +1276,7 @@ struct BSR_GEMM_Functor { const y_value_type val_one = Kokkos::ArithTraits::one(); if (beta != val_one) { - KokkosBatched::TeamVectorScaleInternal::invoke( + KokkosBlas::Impl::TeamVectorScaleInternal::invoke( dev, block_dim, num_rhs, beta, Y_cur.data(), static_cast(Y_cur.stride_0()), static_cast(Y_cur.stride_1())); diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 4d6d6cd1b5..52bbb2f839 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -201,88 +201,125 @@ struct SPMV_MV_BSRMATRIX YVector; typedef typename YVector::non_const_value_type YScalar; + enum class Method { + Fallback, ///< Don't use tensor cores + TensorCores ///< use tensor cores + }; + + /// Precision to use in the tensor core implementation + enum class Precision { + Automatic, ///< Use Double, unless operations match mixed precision + Double, ///< fp64 += fp64 * fp64 + Mixed ///< fp32 += fp16 * fp16 + }; + static void spmv_mv_bsrmatrix( const KokkosKernels::Experimental::Controls &controls, const char mode[], const YScalar &alpha, const AMatrix &A, const XVector &X, const YScalar &beta, const YVector &Y) { #if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) - // user explicitly requests a particular precision - bool requestMixed = false; - bool requestDouble = false; - if (controls.isParameter("tc_precision")) { - if (controls.getParameter("tc_precision") == "mixed") { - requestMixed = true; - } else if (controls.getParameter("tc_precision") == "double") { - requestDouble = true; - } - } - // - bool use_tc = false; - if ((controls.isParameter("algorithm")) && - (controls.getParameter("algorithm") == "experimental_bsr_tc")) { - if (Kokkos::Details::ArithTraits::is_complex == false) - use_tc = true; + Method method = 
Method::Fallback; + { + typedef typename AMatrix::non_const_value_type AScalar; + typedef typename XVector::non_const_value_type XScalar; + // try to use tensor cores if requested + if (controls.getParameter("algorithm") == "experimental_bsr_tc") + method = Method::TensorCores; + // can't use tensor cores for complex + if (Kokkos::Details::ArithTraits::is_complex) + method = Method::Fallback; + if (Kokkos::Details::ArithTraits::is_complex) + method = Method::Fallback; + if (Kokkos::Details::ArithTraits::is_complex) + method = Method::Fallback; + // can't use tensor cores outside GPU + if (!KokkosKernels::Impl::kk_is_gpu_exec_space< + typename AMatrix::execution_space>()) + method = Method::Fallback; + if (!KokkosKernels::Impl::kk_is_gpu_exec_space< + typename XVector::execution_space>()) + method = Method::Fallback; + if (!KokkosKernels::Impl::kk_is_gpu_exec_space< + typename YVector::execution_space>()) + method = Method::Fallback; + // can't use tensor cores unless mode is no-transpose + if (mode[0] != KokkosSparse::NoTranspose[0]) method = Method::Fallback; +#if KOKKOS_HALF_T_IS_FLOAT + // disable tensor cores when Kokkos half is actually a float + method = Method::Fallback; +#endif // KOKKOS_HALF_T_IS_FLOAT } -#endif +#endif // AMPERE || VOLTA #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ARCH_AMPERE) - typedef typename XVector::non_const_value_type XScalar; - typedef typename AMatrix::non_const_value_type AScalar; - typedef Kokkos::Experimental::half_t Half; - - /* Ampere has double += double * double and float += half * half - - use whichever is requested. 
- If none requested, used mixed precision if the inputs are mixed, otherwise - use double - */ - - // input precision matches a tensor core fragment type - constexpr bool operandsHalfHalfFloat = std::is_same::value && - std::is_same::value && - std::is_same::value; - - if (use_tc) { - if (requestMixed) { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, - X, beta, - Y); - return; - } else if (requestDouble) { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, Y); - return; - } else if (operandsHalfHalfFloat) { + { + typedef Kokkos::Experimental::half_t Half; + typedef typename AMatrix::non_const_value_type AScalar; + typedef typename XVector::non_const_value_type XScalar; + + /* Ampere has double += double * double and float += half * half + + use whichever is requested. + If none requested, used mixed precision if the inputs are mixed, otherwise + use double + */ + if (Method::TensorCores == method) { + Precision precision = Precision::Automatic; + if (controls.getParameter("tc_precision") == "mixed") + precision = Precision::Mixed; + else if (controls.getParameter("tc_precision") == "double") + precision = Precision::Double; + + switch (precision) { + case Precision::Mixed: { + BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, + Y); + return; + } + case Precision::Double: { + BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, + Y); + return; + } + case Precision::Automatic: // fallthrough + default: { + constexpr bool operandsHalfHalfFloat = + std::is_same::value && + std::is_same::value && + std::is_same::value; + if (operandsHalfHalfFloat) { + BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, + Y); + return; + } else { + BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, + Y); + return; + } + } + } + } + } +#elif defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ARCH_VOLTA) + { + /* Volta has float += half * half + use it for all matrices + */ + if (Method::TensorCores == method) { 
BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, Y); return; - } else { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, Y); - return; - } - } -#elif defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ARCH_VOLTA) - /* Volta has float += half * half - use it for all matrices - */ - if (use_tc) { - if (requestDouble) { - KokkosKernels::Impl::throw_runtime_exception( - "KokkosSparse::spmv[algorithm=experimental_bsr_tc] " - "tc_precision=double unsupported KOKKOS_ARCH_VOLTA"); } - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, - X, beta, - Y); - (void)requestMixed; // unused - return; } #endif // KOKKOS_ARCH diff --git a/src/sparse/impl/KokkosSparse_spmv_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_impl.hpp index 41843d8674..fcd02a851e 100644 --- a/src/sparse/impl/KokkosSparse_spmv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_impl.hpp @@ -94,7 +94,7 @@ struct SPMV_Transpose_Functor { AMatrix m_A; XVector m_x; YVector m_y; - ordinal_type rows_per_team; + ordinal_type rows_per_team = 0; SPMV_Transpose_Functor(const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, const YVector& m_y_) @@ -725,7 +725,7 @@ struct SPMV_MV_Transpose_Functor { YVector m_y; const ordinal_type n; - ordinal_type rows_per_team; + ordinal_type rows_per_team = 0; SPMV_MV_Transpose_Functor(const coefficient_type& alpha_, const AMatrix& m_A_, const XVector& m_x_, const coefficient_type& beta_, diff --git a/src/sparse/impl/KokkosSparse_spmv_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_spec.hpp index e0fdb2b6cd..cc29d72b77 100644 --- a/src/sparse/impl/KokkosSparse_spmv_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_spec.hpp @@ -111,6 +111,8 @@ struct spmv_mv_eti_spec_avail { // Include the actual specialization declarations #include #include + +#include #include namespace KokkosSparse { @@ -204,7 +206,8 @@ struct SPMV_MV { typedef Kokkos::View YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const 
char mode[], const coefficient_type& alpha, + static void spmv_mv(const KokkosKernels::Experimental::Controls& controls, + const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y); }; @@ -261,7 +264,8 @@ struct SPMV_MV YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const char mode[], const coefficient_type& alpha, + static void spmv_mv(const KokkosKernels::Experimental::Controls& /*controls*/, + const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { typedef Kokkos::Details::ArithTraits KAT; @@ -287,7 +291,8 @@ struct SPMV_MV YVector; typedef typename YVector::non_const_value_type coefficient_type; - static void spmv_mv(const char mode[], const coefficient_type& alpha, + static void spmv_mv(const KokkosKernels::Experimental::Controls& /*controls*/, + const char mode[], const coefficient_type& alpha, const AMatrix& A, const XVector& x, const coefficient_type& beta, const YVector& y) { static_assert(std::is_integral::value, @@ -377,6 +382,8 @@ struct SPMV_MV #include + +#include #include #endif // KOKKOSSPARSE_IMPL_SPMV_SPEC_HPP_ diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 7943b1e602..fbee2fb33f 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -1141,8 +1141,7 @@ struct UpperTriSupernodalFunctor { KokkosBatched::TeamTrsm< member_type, KokkosBatched::Side::Left, KokkosBatched::Uplo::Lower, KokkosBatched::Trans::Transpose, KokkosBatched::Diag::NonUnit, - KokkosBatched::Algo::Trsm::Unblocked>::template invoke(team, one, - Ujj, Xjj); + KokkosBatched::Algo::Trsm::Unblocked>::invoke(team, one, Ujj, Xjj); } team.team_barrier(); } diff --git a/src/sparse/impl/KokkosSparse_trsv_impl.hpp 
b/src/sparse/impl/KokkosSparse_trsv_impl.hpp index f076368827..bff037c228 100644 --- a/src/sparse/impl/KokkosSparse_trsv_impl.hpp +++ b/src/sparse/impl/KokkosSparse_trsv_impl.hpp @@ -218,6 +218,7 @@ void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, typename CrsMatrixType::row_map_type ptr = A.graph.row_map; typename CrsMatrixType::index_type ind = A.graph.entries; typename CrsMatrixType::values_type val = A.values; + typedef Kokkos::Details::ArithTraits STS; // If local_ordinal_type is unsigned and numRows is 0, the loop // below will have entirely the wrong number of iterations. @@ -232,15 +233,18 @@ void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, for (local_ordinal_type j = 0; j < numVecs; ++j) { X(r, j) = Y(r, j); } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - // We assume the diagonal entry is first in the row. - const matrix_scalar_type A_rr = val(beg); - for (offset_type k = beg + static_cast(1); k < end; ++k) { + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + matrix_scalar_type A_rr = STS::zero(); + for (offset_type k = beg; k < end; ++k) { const matrix_scalar_type A_rc = val(k); const local_ordinal_type c = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); + if (r == c) { + A_rr += A_rc; + } else { + for (local_ordinal_type j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } } } // for each entry A_rc in the current row r for (local_ordinal_type j = 0; j < numVecs; ++j) { @@ -254,15 +258,18 @@ void upperTriSolveCsr(RangeMultiVectorType X, const CrsMatrixType& A, for (local_ordinal_type j = 0; j < numVecs; ++j) { X(r, j) = Y(r, j); } - const offset_type beg = ptr(r); - const offset_type end = ptr(r + 1); - // We assume the diagonal entry is first in the row. 
- const matrix_scalar_type A_rr = val(beg); - for (offset_type k = beg + 1; k < end; ++k) { + const offset_type beg = ptr(r); + const offset_type end = ptr(r + 1); + matrix_scalar_type A_rr = STS::zero(); + for (offset_type k = beg; k < end; ++k) { const matrix_scalar_type A_rc = val(k); const local_ordinal_type c = ind(k); - for (local_ordinal_type j = 0; j < numVecs; ++j) { - X(r, j) -= A_rc * X(c, j); + if (r == c) + A_rr += A_rc; + else { + for (local_ordinal_type j = 0; j < numVecs; ++j) { + X(r, j) -= A_rc * X(c, j); + } } } // for each entry A_rc in the current row r for (local_ordinal_type j = 0; j < numVecs; ++j) { diff --git a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp index 19bc5ec163..d779ff3e96 100644 --- a/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp +++ b/src/sparse/impl/KokkosSparse_twostage_gauss_seidel_impl.hpp @@ -57,7 +57,7 @@ // needed for classical GS #include "KokkosSparse_sptrsv.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosSparse_gauss_seidel_handle.hpp" @@ -854,11 +854,11 @@ class TwostageGaussSeidel { // values // CuSparse needs matrix sorted by column indexes for each row // TODO: may need to move this to symbolic/numeric of sptrsv - KokkosKernels::sort_crs_matrix( + KokkosSparse::sort_crs_matrix( rowmap_viewL, column_viewL, values_viewL); - KokkosKernels::sort_crs_matrix( + KokkosSparse::sort_crs_matrix( rowmap_viewU, column_viewU, values_viewU); // now do symbolic diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 2878543f33..976da2c358 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -45,7 +45,10 @@ #ifndef KOKKOSKERNELS_TEST_UTILS_HPP #define KOKKOSKERNELS_TEST_UTILS_HPP +#include + #include "KokkosKernels_Utils.hpp" +#include "KokkosKernels_IOUtils.hpp" #include "Kokkos_ArithTraits.hpp" 
#include "KokkosSparse_spmv.hpp" // Make this include-able from all subdirectories @@ -338,6 +341,15 @@ class epsilon { }; #endif // KOKKOS_HALF_T_IS_FLOAT +// explicit epsilon specializations +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +template <> +class epsilon { + public: + constexpr static double value = 0.0078125F; +}; +#endif // KOKKOS_BHALF_T_IS_FLOAT + using KokkosKernels::Impl::getRandomBounds; template +class RandCscMat { + private: + using ValViewTypeD = Kokkos::View; + using RowIdViewTypeD = Kokkos::View; + using ColMapViewTypeD = Kokkos::View; + int64_t __nrows; + int64_t __ncols; + int64_t __nnz = 0; + ColMapViewTypeD __col_map_d; + RowIdViewTypeD __row_ids_d; + ValViewTypeD __vals_d; + using ColMapViewTypeH = typename ColMapViewTypeD::HostMirror; + using RowIdViewTypeH = typename RowIdViewTypeD::HostMirror; + using ValViewTypeH = typename ValViewTypeD::HostMirror; + ColMapViewTypeH __col_map; + RowIdViewTypeH __row_ids; + ValViewTypeH __vals; + bool __fully_sparse; + + /// Generates a random column map where: + /// 1. __col_map(i) is an offset into __row_ids, in [0, __nnz] + /// 2. __col_map(i) >= __col_map(i - 1) for i >= 1 + /// 3. __col_map(i) == __col_map(i + 1) iff column i has no entries + /// 4.
__col_map(i) - col_map(i - 1) is in [0, m] + void __populate_random_csc_mat(uint64_t ticks) { + std::srand(ticks); + for (int64_t col_idx = 0; col_idx < __ncols; col_idx++) { + int64_t r = std::rand() % (__nrows + 1); + if (r == 0 || __fully_sparse) { // 100% sparse column + __col_map(col_idx) = __nnz; + } else { // sparse column with r elements + // Populate r row ids + std::vector v(r); + + for (int64_t i = 0; i < r; i++) v.at(i) = i; + + std::shuffle(v.begin(), v.end(), std::mt19937(std::random_device()())); + + for (int64_t i = 0; i < r; i++) __row_ids(i + __nnz) = v.at(i); + + // Point to new column and accumulate number of non zeros + __col_map(col_idx) = __nnz; + __nnz += r; + } + } + + // last entry in map points to end of row id list + __col_map(__ncols) = __nnz; + + // Copy to device + Kokkos::deep_copy(__col_map_d, __col_map); + Kokkos::deep_copy(__row_ids_d, __row_ids); + ExeSpaceType().fence(); + } + + template + T __getter_copy_helper(T src) { + T dst(std::string("RandCscMat.") + typeid(T).name() + " copy", + src.extent(0)); + Kokkos::deep_copy(dst, src); + ExeSpaceType().fence(); + return dst; + } + + public: + std::string info; + /// Constructs a random csc matrix. + /// \param m The number of rows. + /// \param n The number of columns. + /// \param min_val The minimum scalar value in the matrix. + /// \param max_val The maximum scalar value in the matrix. 
+ RandCscMat(int64_t m, int64_t n, ScalarType min_val, ScalarType max_val, + bool fully_sparse = false) { + __ncols = n; + __nrows = m; + __fully_sparse = fully_sparse; + __col_map_d = ColMapViewTypeD("RandCscMat.ColMapViewType", __ncols + 1); + __col_map = Kokkos::create_mirror_view(__col_map_d); + __row_ids_d = RowIdViewTypeD("RandCscMat.RowIdViewType", + m * n + 1); // over-allocated + __row_ids = Kokkos::create_mirror_view(__row_ids_d); + + uint64_t ticks = + std::chrono::high_resolution_clock::now().time_since_epoch().count() % + UINT32_MAX; + + info = std::string( + std::string("RandCscMat<") + typeid(ScalarType).name() + ", " + + typeid(LayoutType).name() + ", " + typeid(ExeSpaceType).name() + ">(" + + std::to_string(m) + ", " + std::to_string(n) + + "...): rand seed: " + std::to_string(ticks) + + ", fully sparse: " + (__fully_sparse ? "true" : "false") + "\n"); + Kokkos::Random_XorShift64_Pool random(ticks); + __populate_random_csc_mat(ticks); + + __vals_d = ValViewTypeD("RandCscMat.ValViewType", __nnz + 1); + __vals = Kokkos::create_mirror_view(__vals_d); + Kokkos::fill_random(__vals, random, min_val, max_val); // random scalars + Kokkos::fence(); + __vals(__nnz) = ScalarType(0); + + // Copy to device + Kokkos::deep_copy(__vals_d, __vals); + ExeSpaceType().fence(); + } + + // O(c), where c is a constant. + ScalarType operator()(int64_t idx) { return __vals(idx); } + + int64_t get_nnz() { return __nnz; } + int64_t get_m() { return __nrows; } + int64_t get_n() { return __ncols; } + int64_t get_col_len(int64_t j) { + return j < __ncols ? (__col_map(j + 1) - __col_map(j)) : 0; + } + int64_t get_col_start(int64_t j) { return j < __ncols ? 
__col_map(j) : 0; } + ValViewTypeD get_vals() { return __getter_copy_helper(__vals_d); } + RowIdViewTypeD get_row_ids() { return __getter_copy_helper(__row_ids_d); } + ColMapViewTypeD get_col_map() { return __getter_copy_helper(__col_map_d); } +}; + +/// \brief Randomly shuffle the entries in each row (col) of a Crs (Ccs) matrix. +template +void shuffleMatrixEntries(Rowptrs rowptrs, Entries entries, Values values) { + using size_type = typename Rowptrs::non_const_value_type; + using ordinal_type = typename Entries::value_type; + auto rowptrsHost = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), rowptrs); + auto entriesHost = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), entries); + auto valuesHost = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), values); + ordinal_type numRows = + rowptrsHost.extent(0) ? (rowptrsHost.extent(0) - 1) : 0; + for (ordinal_type i = 0; i < numRows; i++) { + size_type rowBegin = rowptrsHost(i); + size_type rowEnd = rowptrsHost(i + 1); + for (size_type j = rowBegin; j + 1 < rowEnd; j++) { /* j + 1 < rowEnd: no unsigned wrap (and no rand() % 0) when rowEnd == 0 */ + ordinal_type swapRange = rowEnd - j; + size_type swapOffset = j + (rand() % swapRange); + std::swap(entriesHost(j), entriesHost(swapOffset)); + std::swap(valuesHost(j), valuesHost(swapOffset)); + } + } + Kokkos::deep_copy(entries, entriesHost); + Kokkos::deep_copy(values, valuesHost); +} + } // namespace Test #endif diff --git a/unit_test/batched/dense/Test_Batched_Dense.hpp b/unit_test/batched/dense/Test_Batched_Dense.hpp index 47a1cf1fd4..edf573c633 100644 --- a/unit_test/batched/dense/Test_Batched_Dense.hpp +++ b/unit_test/batched/dense/Test_Batched_Dense.hpp @@ -16,15 +16,14 @@ #include "Test_Batched_SerialGemv.hpp" #include "Test_Batched_SerialGemv_Real.hpp" #include "Test_Batched_SerialGemv_Complex.hpp" +#include "Test_Batched_SerialGesv.hpp" +#include "Test_Batched_SerialGesv_Real.hpp" #include "Test_Batched_SerialInverseLU.hpp" #include "Test_Batched_SerialInverseLU_Real.hpp" #include
"Test_Batched_SerialInverseLU_Complex.hpp" #include "Test_Batched_SerialLU.hpp" #include "Test_Batched_SerialLU_Real.hpp" #include "Test_Batched_SerialLU_Complex.hpp" -#include "Test_Batched_SerialMatUtil.hpp" -#include "Test_Batched_SerialMatUtil_Real.hpp" -#include "Test_Batched_SerialMatUtil_Complex.hpp" #include "Test_Batched_SerialSolveLU.hpp" #include "Test_Batched_SerialSolveLU_Real.hpp" #include "Test_Batched_SerialSolveLU_Complex.hpp" @@ -52,15 +51,14 @@ #include "Test_Batched_TeamGemv.hpp" #include "Test_Batched_TeamGemv_Real.hpp" #include "Test_Batched_TeamGemv_Complex.hpp" +#include "Test_Batched_TeamGesv.hpp" +#include "Test_Batched_TeamGesv_Real.hpp" #include "Test_Batched_TeamInverseLU.hpp" #include "Test_Batched_TeamInverseLU_Real.hpp" #include "Test_Batched_TeamInverseLU_Complex.hpp" #include "Test_Batched_TeamLU.hpp" #include "Test_Batched_TeamLU_Real.hpp" #include "Test_Batched_TeamLU_Complex.hpp" -#include "Test_Batched_TeamMatUtil.hpp" -#include "Test_Batched_TeamMatUtil_Real.hpp" -#include "Test_Batched_TeamMatUtil_Complex.hpp" #include "Test_Batched_TeamSolveLU.hpp" #include "Test_Batched_TeamSolveLU_Real.hpp" #include "Test_Batched_TeamSolveLU_Complex.hpp" @@ -80,6 +78,8 @@ #include "Test_Batched_TeamVectorGemm.hpp" #include "Test_Batched_TeamVectorGemm_Real.hpp" #include "Test_Batched_TeamVectorGemm_Complex.hpp" +#include "Test_Batched_TeamVectorGesv.hpp" +#include "Test_Batched_TeamVectorGesv_Real.hpp" #include "Test_Batched_TeamVectorQR.hpp" #include "Test_Batched_TeamVectorQR_Real.hpp" #include "Test_Batched_TeamVectorQR_WithColumnPivoting.hpp" diff --git a/unit_test/batched/dense/Test_Batched_DenseUtils.hpp b/unit_test/batched/dense/Test_Batched_DenseUtils.hpp new file mode 100644 index 0000000000..d355159a9a --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_DenseUtils.hpp @@ -0,0 +1,44 @@ +#ifndef TEST_BATCHED_DENSE_HELPER_HPP +#define TEST_BATCHED_DENSE_HELPER_HPP + +namespace KokkosBatched { +template +void 
create_tridiagonal_batched_matrices(const MatrixViewType &A, + const VectorViewType &B) { + Kokkos::Random_XorShift64_Pool< + typename VectorViewType::device_type::execution_space> + random(13718); + Kokkos::fill_random( + B, random, + Kokkos::reduction_identity::prod()); + + auto A_host = Kokkos::create_mirror_view(A); + + const int N = A.extent(0); + const int BlkSize = A.extent(1); + + for (int l = 0; l < N; ++l) { + for (int i = 0; i < BlkSize; ++i) { + for (int j = i; j < BlkSize; ++j) { + if (i == j) + A_host(l, i, j) = typename VectorViewType::value_type(2.0); + else if (i == j - 1) { + A_host(l, i, j) = typename VectorViewType::value_type(-1.0); + A_host(l, j, i) = typename VectorViewType::value_type(-1.0); + } else { + A_host(l, i, j) = typename VectorViewType::value_type(0.0); + A_host(l, j, i) = typename VectorViewType::value_type(0.0); + } + } + } + } + + Kokkos::fence(); + + Kokkos::deep_copy(A, A_host); + + Kokkos::fence(); +} +} // namespace KokkosBatched + +#endif // TEST_BATCHED_DENSE_HELPER_HPP diff --git a/unit_test/batched/dense/Test_Batched_SerialGesv.hpp b/unit_test/batched/dense/Test_Batched_SerialGesv.hpp new file mode 100644 index 0000000000..233d6bedf3 --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_SerialGesv.hpp @@ -0,0 +1,141 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBatched_Gesv.hpp" +#include "KokkosBatched_Dot.hpp" +#include "KokkosBatched_Gemv_Decl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace Gesv { + +template +struct Functor_TestBatchedSerialGesv { + const MatrixType _A; + const MatrixType _tmp; + const VectorType _X; + const VectorType _B; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedSerialGesv(const MatrixType &A, const MatrixType &tmp, + const VectorType &X, const VectorType &B) + : _A(A), 
_tmp(tmp), _X(X), _B(B) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int k) const { + auto A = Kokkos::subview(_A, k, Kokkos::ALL, Kokkos::ALL); + auto x = Kokkos::subview(_X, k, Kokkos::ALL); + auto b = Kokkos::subview(_B, k, Kokkos::ALL); + auto tmp = Kokkos::subview(_tmp, k, Kokkos::ALL, Kokkos::ALL); + + KokkosBatched::SerialGesv::invoke(A, x, b, tmp); + } + + inline void run() { + typedef typename VectorType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialGesv"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _X.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } +}; + +template +void impl_test_batched_gesv(const int N, const int BlkSize) { + typedef typename MatrixType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + using MagnitudeType = + typename Kokkos::Details::ArithTraits::mag_type; + using NormViewType = + Kokkos::View; + + NormViewType sqr_norm_j("sqr_norm_j", N); + auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); + + MatrixType A("A", N, BlkSize, BlkSize), A2("A", N, BlkSize, BlkSize), + tmp("tmp", N, BlkSize, BlkSize + 4); + VectorType B("b", N, BlkSize), B2("b", N, BlkSize), X("x", N, BlkSize); + + create_tridiagonal_batched_matrices(A, B); + Kokkos::deep_copy(A2, A); + Kokkos::deep_copy(B2, B); + + auto A_host = Kokkos::create_mirror_view(A2); + auto B_host = Kokkos::create_mirror_view(B2); + auto X_host = Kokkos::create_mirror_view(X); + + Kokkos::deep_copy(A_host, A2); + Kokkos::deep_copy(B_host, B2); + + Kokkos::fence(); + + Functor_TestBatchedSerialGesv(A, tmp, X, B) + .run(); + + Kokkos::fence(); + + Kokkos::deep_copy(X_host, X); + + for (int l = 0; l < N; ++l) + KokkosBatched::SerialGemv:: + invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), + 
Kokkos::subview(X_host, l, Kokkos::ALL), 1, + Kokkos::subview(B_host, l, Kokkos::ALL)); + + KokkosBatched::SerialDot::invoke(B_host, B_host, + sqr_norm_j_host); + + const MagnitudeType eps = 1.0e3 * ats::epsilon(); + + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(sqr_norm_j_host(l), 0, eps); +} +} // namespace Gesv +} // namespace Test + +template +int test_batched_gesv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View + MatrixType; + typedef Kokkos::View + VectorType; + + for (int i = 3; i < 10; ++i) { + Test::Gesv::impl_test_batched_gesv(1024, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View + MatrixType; + typedef Kokkos::View + VectorType; + + for (int i = 3; i < 10; ++i) { + Test::Gesv::impl_test_batched_gesv(1024, i); + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp b/unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp new file mode 100644 index 0000000000..84a630efa3 --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_SerialGesv_Real.hpp @@ -0,0 +1,19 @@ +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, batched_scalar_serial_gesv_static_pivoting_float) { + test_batched_gesv(); +} +TEST_F(TestCategory, batched_scalar_serial_gesv_no_pivoting_float) { + test_batched_gesv(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, batched_scalar_serial_gesv_static_pivoting_double) { + test_batched_gesv(); +} +TEST_F(TestCategory, batched_scalar_serial_gesv_no_pivoting_double) { + test_batched_gesv(); +} +#endif diff --git a/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp b/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp deleted file mode 100644 index f9a58f5442..0000000000 --- a/unit_test/batched/dense/Test_Batched_SerialMatUtil.hpp +++ /dev/null @@ -1,165 +0,0 @@ -/// \author Kyungjoo Kim (kyukim@sandia.gov) - -#include "gtest/gtest.h" -#include "Kokkos_Core.hpp" -#include 
"Kokkos_Random.hpp" - -#include "KokkosBatched_Set_Decl.hpp" -#include "KokkosBatched_Set_Impl.hpp" - -#include "KokkosBatched_Scale_Decl.hpp" -#include "KokkosBatched_Scale_Impl.hpp" - -#include "KokkosKernels_TestUtils.hpp" - -using namespace KokkosBatched; - -namespace Test { - -enum : int { BatchedSet = 0, BatchedScale = 1 }; - -struct KokkosKernelTag {}; -struct NaiveTag {}; - -template -struct Functor_TestBatchedSerialMatUtil { - ScalarType _alpha; - ViewType _a; - - KOKKOS_INLINE_FUNCTION - Functor_TestBatchedSerialMatUtil(const ScalarType alpha, const ViewType &a) - : _alpha(alpha), _a(a) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const KokkosKernelTag &, const int i) const { - auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL()); - switch (TestID) { - case BatchedSet: SerialSet ::invoke(_alpha, A); break; - case BatchedScale: SerialScale::invoke(_alpha, A); break; - } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const NaiveTag &, const int k) const { - // MD Note: changing because of the error with -werror - auto A = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); - const int m = A.extent(0), n = A.extent(1); - switch (TestID) { - case BatchedSet: { - for (int i = 0; i < m; ++i) - for (int j = 0; j < n; ++j) A(i, j) = _alpha; - break; - } - case BatchedScale: { - for (int i = 0; i < m; ++i) - for (int j = 0; j < n; ++j) A(i, j) *= _alpha; - break; - } - } - } - - inline int run() { - typedef typename ViewType::value_type value_type; - std::string name_region("KokkosBatched::Test::SerialMatUtil"); - const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBatched" - : std::is_same::value ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = - (TestID == BatchedSet - ? "Set" - : TestID == BatchedScale ? 
"Scale" : "UnknownTest"); - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; - Kokkos::Profiling::pushRegion(name.c_str()); - Kokkos::RangePolicy policy(0, _a.extent(0)); - Kokkos::parallel_for(name.c_str(), policy, *this); - Kokkos::Profiling::popRegion(); - return 0; - } -}; - -template -void impl_test_batched_matutil(const int N, const int BlkSize) { - /// typedefs - typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; - - /// radomized input testing views - const ScalarType alpha = 11.1; - ViewType a("a", N, BlkSize, BlkSize); - ViewType b("b", N, BlkSize, BlkSize); - - Kokkos::Random_XorShift64_Pool random( - 13718); - Kokkos::fill_random(a, random, value_type(1.0)); - - Kokkos::fence(); - - Kokkos::deep_copy(b, a); - - /// test body - Functor_TestBatchedSerialMatUtil(alpha, a) - .run(); - Functor_TestBatchedSerialMatUtil(alpha, b) - .run(); - - Kokkos::fence(); - - /// for comparison send it to host - typename ViewType::HostMirror a_host = Kokkos::create_mirror_view(a); - typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b); - - Kokkos::deep_copy(a_host, a); - Kokkos::deep_copy(b_host, b); - - /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); - for (int k = 0; k < N; ++k) - for (int i = 0; i < BlkSize; ++i) - for (int j = 0; j < BlkSize; ++j) - EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); -} -} // namespace Test - -template -int test_batched_matutil() { -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) - { - typedef Kokkos::View - ViewType; - Test::impl_test_batched_matutil( - 0, 10); - Test::impl_test_batched_matutil( - 10, 15); - Test::impl_test_batched_matutil( - 1024, 9); - Test::impl_test_batched_matutil( - 132231, 3); - } -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) - { - typedef Kokkos::View - ViewType; - Test::impl_test_batched_matutil( - 0, 10); - Test::impl_test_batched_matutil( - 10, 15); - 
Test::impl_test_batched_matutil( - 1024, 9); - Test::impl_test_batched_matutil( - 132231, 3); - } -#endif - - return 0; -} diff --git a/unit_test/batched/dense/Test_Batched_SerialMatUtil_Complex.hpp b/unit_test/batched/dense/Test_Batched_SerialMatUtil_Complex.hpp deleted file mode 100644 index 055a0cae62..0000000000 --- a/unit_test/batched/dense/Test_Batched_SerialMatUtil_Complex.hpp +++ /dev/null @@ -1,19 +0,0 @@ - -#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) -TEST_F(TestCategory, batched_scalar_serial_set_dcomplex_dcomplex) { - test_batched_matutil, - Kokkos::complex, ::Test::BatchedSet>(); -} -TEST_F(TestCategory, batched_scalar_serial_scale_dcomplex_dcomplex) { - test_batched_matutil, - Kokkos::complex, ::Test::BatchedScale>(); -} -TEST_F(TestCategory, batched_scalar_serial_set_dcomplex_double) { - test_batched_matutil, double, - ::Test::BatchedSet>(); -} -TEST_F(TestCategory, batched_scalar_serial_scale_dcomplex_double) { - test_batched_matutil, double, - ::Test::BatchedScale>(); -} -#endif diff --git a/unit_test/batched/dense/Test_Batched_SerialMatUtil_Real.hpp b/unit_test/batched/dense/Test_Batched_SerialMatUtil_Real.hpp deleted file mode 100644 index c1644f9798..0000000000 --- a/unit_test/batched/dense/Test_Batched_SerialMatUtil_Real.hpp +++ /dev/null @@ -1,18 +0,0 @@ - -#if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_serial_set_float_float) { - test_batched_matutil(); -} -TEST_F(TestCategory, batched_scalar_serial_scale_float_float) { - test_batched_matutil(); -} -#endif - -#if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_serial_set_double_double) { - test_batched_matutil(); -} -TEST_F(TestCategory, batched_scalar_serial_scale_double_double) { - test_batched_matutil(); -} -#endif diff --git a/unit_test/batched/dense/Test_Batched_SerialSVD.hpp b/unit_test/batched/dense/Test_Batched_SerialSVD.hpp index 57ec7f645b..d30da1726c 100644 --- a/unit_test/batched/dense/Test_Batched_SerialSVD.hpp +++ 
b/unit_test/batched/dense/Test_Batched_SerialSVD.hpp @@ -31,7 +31,7 @@ double simpleNorm2(const Vector& v) { double m = KAT::abs(vhost(i)); d += m * m; } - return Kokkos::Experimental::sqrt(d); + return std::sqrt(d); } template diff --git a/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp b/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp index cdcd00cff2..3ffc34db23 100644 --- a/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp @@ -5,19 +5,21 @@ TEST_F(TestCategory, batched_scalar_team_gemv_nt_dcomplex_dcomplex) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_team_gemv, + Kokkos::complex, param_tag_type, + algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemv_t_dcomplex_dcomplex) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv, - Kokkos::complex, param_tag_type, algo_tag_type>(); + test_batched_team_gemv, + Kokkos::complex, param_tag_type, + algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemv_ct_dcomplex_dcomplex ) { // typedef ::Test::TeamGemv::ParamTag param_tag_type; // typedef Algo::Gemv::Blocked algo_tag_type; -// test_batched_gemv,Kokkos::complex,param_tag_type,algo_tag_type>(); +// test_batched_team_gemv,Kokkos::complex,param_tag_type,algo_tag_type>(); // } /// dcomplex, double @@ -25,19 +27,19 @@ TEST_F(TestCategory, batched_scalar_team_gemv_t_dcomplex_dcomplex) { TEST_F(TestCategory, batched_scalar_team_gemv_nt_dcomplex_double) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv, double, - param_tag_type, algo_tag_type>(); + test_batched_team_gemv, double, + param_tag_type, algo_tag_type>(); } TEST_F(TestCategory, batched_scalar_team_gemv_t_dcomplex_double) { typedef 
::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv, double, - param_tag_type, algo_tag_type>(); + test_batched_team_gemv, double, + param_tag_type, algo_tag_type>(); } // TEST_F( TestCategory, batched_scalar_team_gemv_ct_dcomplex_double ) { // typedef ::Test::TeamGemv::ParamTag param_tag_type; // typedef Algo::Gemv::Blocked algo_tag_type; -// test_batched_gemv,double,param_tag_type,algo_tag_type>(); +// test_batched_team_gemv,double,param_tag_type,algo_tag_type>(); // } #endif diff --git a/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp index 8401075f47..2c4db11b2d 100644 --- a/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp @@ -3,14 +3,14 @@ TEST_F(TestCategory, batched_scalar_team_gemv_nt_float_float) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv(); + test_batched_team_gemv(); } TEST_F(TestCategory, batched_scalar_team_gemv_t_float_float) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv(); + test_batched_team_gemv(); } #endif @@ -18,13 +18,13 @@ TEST_F(TestCategory, batched_scalar_team_gemv_t_float_float) { TEST_F(TestCategory, batched_scalar_team_gemv_nt_double_double) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv(); + test_batched_team_gemv(); } TEST_F(TestCategory, batched_scalar_team_gemv_t_double_double) { typedef ::Test::TeamGemv::ParamTag param_tag_type; typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_gemv(); + test_batched_team_gemv(); } #endif diff --git a/unit_test/batched/dense/Test_Batched_TeamGesv.hpp b/unit_test/batched/dense/Test_Batched_TeamGesv.hpp new file mode 100644 index 0000000000..8f6bcf9f9d --- /dev/null +++ 
b/unit_test/batched/dense/Test_Batched_TeamGesv.hpp @@ -0,0 +1,152 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBatched_Gesv.hpp" +#include "KokkosBatched_Dot.hpp" +#include "KokkosBatched_Gemv_Decl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace TeamGesv { + +template +struct Functor_TestBatchedTeamGesv { + const MatrixType _A; + const VectorType _X; + const VectorType _B; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamGesv(const MatrixType &A, const VectorType &X, + const VectorType &B) + : _A(A), _X(X), _B(B) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { + const int matrix_id = static_cast(member.league_rank()); + auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL); + auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); + + member.team_barrier(); + KokkosBatched::TeamGesv::invoke(member, A, x, b); + member.team_barrier(); + } + + inline void run() { + typedef typename VectorType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamGesv"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), + Kokkos::AUTO()); + + using MatrixViewType = + Kokkos::View; + + const int n = _A.extent(1); + size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4); + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0)); + + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } +}; + +template +void impl_test_batched_gesv(const int N, const int BlkSize) { + typedef typename MatrixType::value_type value_type; + typedef 
Kokkos::Details::ArithTraits ats; + + using MagnitudeType = + typename Kokkos::Details::ArithTraits::mag_type; + using NormViewType = + Kokkos::View; + + NormViewType sqr_norm_j("sqr_norm_j", N); + auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); + + MatrixType A("A", N, BlkSize, BlkSize), A2("A", N, BlkSize, BlkSize); + VectorType B("b", N, BlkSize), B2("b", N, BlkSize), X("x", N, BlkSize); + + create_tridiagonal_batched_matrices(A, B); + Kokkos::deep_copy(A2, A); + Kokkos::deep_copy(B2, B); + + auto A_host = Kokkos::create_mirror_view(A2); + auto B_host = Kokkos::create_mirror_view(B2); + auto X_host = Kokkos::create_mirror_view(X); + + Kokkos::deep_copy(A_host, A2); + Kokkos::deep_copy(B_host, B2); + + Kokkos::fence(); + + Functor_TestBatchedTeamGesv( + A, X, B) + .run(); + + Kokkos::fence(); + + Kokkos::deep_copy(X_host, X); + + for (int l = 0; l < N; ++l) + KokkosBatched::SerialGemv:: + invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), + Kokkos::subview(X_host, l, Kokkos::ALL), 1, + Kokkos::subview(B_host, l, Kokkos::ALL)); + + KokkosBatched::SerialDot::invoke(B_host, B_host, + sqr_norm_j_host); + + const MagnitudeType eps = 1.0e3 * ats::epsilon(); + + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(sqr_norm_j_host(l), 0, eps); +} +} // namespace TeamGesv +} // namespace Test + +template +int test_batched_team_gesv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View + MatrixType; + typedef Kokkos::View + VectorType; + + for (int i = 3; i < 10; ++i) { + Test::TeamGesv::impl_test_batched_gesv(1024, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View + MatrixType; + typedef Kokkos::View + VectorType; + + for (int i = 3; i < 10; ++i) { + Test::TeamGesv::impl_test_batched_gesv(1024, i); + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp new file mode 100644 index 
0000000000..8dca15a4a2 --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_TeamGesv_Real.hpp @@ -0,0 +1,21 @@ +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, batched_scalar_team_gesv_static_pivoting_float) { + test_batched_team_gesv(); +} +TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_float) { + test_batched_team_gesv(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, batched_scalar_team_gesv_static_pivoting_double) { + test_batched_team_gesv(); +} +TEST_F(TestCategory, batched_scalar_team_gesv_no_pivoting_double) { + test_batched_team_gesv(); +} +#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp b/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp deleted file mode 100644 index 16879444f7..0000000000 --- a/unit_test/batched/dense/Test_Batched_TeamMatUtil.hpp +++ /dev/null @@ -1,178 +0,0 @@ -/// \author Kyungjoo Kim (kyukim@sandia.gov) - -#include "gtest/gtest.h" -#include "Kokkos_Core.hpp" -#include "Kokkos_Random.hpp" - -#include "KokkosBatched_Set_Decl.hpp" -#include "KokkosBatched_Set_Impl.hpp" - -#include "KokkosBatched_Scale_Decl.hpp" -#include "KokkosBatched_Scale_Impl.hpp" - -#include "KokkosKernels_TestUtils.hpp" - -using namespace KokkosBatched; - -namespace Test { -namespace TeamMatUtil { - -enum : int { BatchedSet = 0, BatchedScale = 1 }; - -struct KokkosKernelTag {}; -struct NaiveTag {}; - -template -struct Functor_TestBatchedTeamMatUtil { - ScalarType _alpha; - ViewType _a; - - KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamMatUtil(const ScalarType alpha, const ViewType &a) - : _alpha(alpha), _a(a) {} - - template - KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, - const MemberType &member) const { - const int i = member.league_rank(); - auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL()); - switch (TestID) { - case BatchedSet: TeamSet::invoke(member, _alpha, A); break; - case BatchedScale: - TeamScale::invoke(member, _alpha, A); - break; - } - 
} - - template - KOKKOS_INLINE_FUNCTION void operator()(const NaiveTag &, - const MemberType &member) const { - if (member.team_rank() == 0) { - const int k = member.league_rank(); - auto A = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); - const int m = A.extent(0), n = A.extent(1); - switch (TestID) { - case BatchedSet: { - for (int i = 0; i < m; ++i) - for (int j = 0; j < n; ++j) A(i, j) = _alpha; - break; - } - case BatchedScale: { - for (int i = 0; i < m; ++i) - for (int j = 0; j < n; ++j) A(i, j) *= _alpha; - break; - } - } - } - } - - inline int run() { - typedef typename ViewType::value_type value_type; - std::string name_region("KokkosBatched::Test::SerialMatUtil"); - const std::string name_value_type = Test::value_type_name(); - std::string name_work_tag = - (std::is_same::value - ? "::KokkosBatched" - : std::is_same::value ? "::Naive" - : "::UnknownWorkTag"); - std::string name_test_id = - (TestID == BatchedSet - ? "Set" - : TestID == BatchedScale ? "Scale" : "UnknownTest"); - std::string name = - name_region + name_value_type + name_work_tag + name_test_id; - Kokkos::Profiling::pushRegion(name.c_str()); - - const int league_size = _a.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); - Kokkos::parallel_for(name.c_str(), policy, *this); - Kokkos::Profiling::popRegion(); - - return 0; - } -}; - -template -void impl_test_batched_matutil(const int N, const int BlkSize) { - /// typedefs - typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; - - /// radomized input testing views - const ScalarType alpha = 11.1; - ViewType a("a", N, BlkSize, BlkSize); - ViewType b("b", N, BlkSize, BlkSize); - - Kokkos::Random_XorShift64_Pool random( - 13718); - Kokkos::fill_random(a, random, value_type(1.0)); - - Kokkos::fence(); - - Kokkos::deep_copy(b, a); - - /// test body - Functor_TestBatchedTeamMatUtil(alpha, a) - .run(); - Functor_TestBatchedTeamMatUtil(alpha, b) - .run(); - - Kokkos::fence(); - - /// for 
comparison send it to host - typename ViewType::HostMirror a_host = Kokkos::create_mirror_view(a); - typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b); - - Kokkos::deep_copy(a_host, a); - Kokkos::deep_copy(b_host, b); - - /// check a = b - typename ats::mag_type eps = - 100 * std::numeric_limits::epsilon(); - for (int k = 0; k < N; ++k) - for (int i = 0; i < BlkSize; ++i) - for (int j = 0; j < BlkSize; ++j) - EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); -} -} // namespace TeamMatUtil -} // namespace Test - -template -int test_batched_team_matutil() { -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) - { - typedef Kokkos::View - ViewType; - Test::TeamMatUtil::impl_test_batched_matutil(0, 10); - Test::TeamMatUtil::impl_test_batched_matutil(10, 15); - Test::TeamMatUtil::impl_test_batched_matutil(1024, 9); - Test::TeamMatUtil::impl_test_batched_matutil(132231, 3); - } -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) - { - typedef Kokkos::View - ViewType; - Test::TeamMatUtil::impl_test_batched_matutil(0, 10); - Test::TeamMatUtil::impl_test_batched_matutil(10, 15); - Test::TeamMatUtil::impl_test_batched_matutil(1024, 9); - Test::TeamMatUtil::impl_test_batched_matutil(132231, 3); - } -#endif - - return 0; -} diff --git a/unit_test/batched/dense/Test_Batched_TeamMatUtil_Complex.hpp b/unit_test/batched/dense/Test_Batched_TeamMatUtil_Complex.hpp deleted file mode 100644 index 7f573354d8..0000000000 --- a/unit_test/batched/dense/Test_Batched_TeamMatUtil_Complex.hpp +++ /dev/null @@ -1,19 +0,0 @@ - -#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) -TEST_F(TestCategory, batched_scalar_team_set_dcomplex_dcomplex) { - test_batched_team_matutil, - Kokkos::complex, ::Test::BatchedSet>(); -} -TEST_F(TestCategory, batched_scalar_team_scale_dcomplex_dcomplex) { - test_batched_team_matutil, - Kokkos::complex, ::Test::BatchedScale>(); -} -TEST_F(TestCategory, batched_scalar_team_set_dcomplex_double) { - test_batched_team_matutil, double, - ::Test::BatchedSet>(); 
-} -TEST_F(TestCategory, batched_scalar_team_scale_dcomplex_double) { - test_batched_team_matutil, double, - ::Test::BatchedScale>(); -} -#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamMatUtil_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamMatUtil_Real.hpp deleted file mode 100644 index 1f13b79cca..0000000000 --- a/unit_test/batched/dense/Test_Batched_TeamMatUtil_Real.hpp +++ /dev/null @@ -1,21 +0,0 @@ - -#if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_team_set_float_float) { - test_batched_team_matutil(); -} -TEST_F(TestCategory, batched_scalar_team_scale_float_float) { - test_batched_team_matutil(); -} -#endif - -#if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_team_set_double_double) { - test_batched_team_matutil(); -} -TEST_F(TestCategory, batched_scalar_team_scale_double_double) { - test_batched_team_matutil(); -} -#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp new file mode 100644 index 0000000000..9ee05cb919 --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_TeamVectorGesv.hpp @@ -0,0 +1,155 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBatched_Gesv.hpp" +#include "KokkosBatched_Dot.hpp" +#include "KokkosBatched_Gemv_Decl.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +#include "Test_Batched_DenseUtils.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace TeamVectorGesv { + +template +struct Functor_TestBatchedTeamVectorGesv { + const MatrixType _A; + const VectorType _X; + const VectorType _B; + + KOKKOS_INLINE_FUNCTION + Functor_TestBatchedTeamVectorGesv(const MatrixType &A, const VectorType &X, + const VectorType &B) + : _A(A), _X(X), _B(B) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { + const int matrix_id = 
static_cast(member.league_rank()); + auto A = Kokkos::subview(_A, matrix_id, Kokkos::ALL, Kokkos::ALL); + auto x = Kokkos::subview(_X, matrix_id, Kokkos::ALL); + auto b = Kokkos::subview(_B, matrix_id, Kokkos::ALL); + + member.team_barrier(); + KokkosBatched::TeamVectorGesv::invoke(member, A, x, + b); + member.team_barrier(); + } + + inline void run() { + typedef typename VectorType::value_type value_type; + std::string name_region("KokkosBatched::Test::TeamVectorGesv"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::TeamPolicy policy(_X.extent(0), Kokkos::AUTO(), + Kokkos::AUTO()); + + using MatrixViewType = + Kokkos::View; + + const int n = _A.extent(1); + size_t bytes_0 = MatrixViewType::shmem_size(n, n + 4); + policy.set_scratch_size(0, Kokkos::PerTeam(bytes_0)); + + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } +}; + +template +void impl_test_batched_gesv(const int N, const int BlkSize) { + typedef typename MatrixType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + using MagnitudeType = + typename Kokkos::Details::ArithTraits::mag_type; + using NormViewType = + Kokkos::View; + + NormViewType sqr_norm_j("sqr_norm_j", N); + auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); + + MatrixType A("A", N, BlkSize, BlkSize), A2("A", N, BlkSize, BlkSize); + VectorType B("b", N, BlkSize), B2("b", N, BlkSize), X("x", N, BlkSize); + + create_tridiagonal_batched_matrices(A, B); + Kokkos::deep_copy(A2, A); + Kokkos::deep_copy(B2, B); + + auto A_host = Kokkos::create_mirror_view(A2); + auto B_host = Kokkos::create_mirror_view(B2); + auto X_host = Kokkos::create_mirror_view(X); + + Kokkos::deep_copy(A_host, A2); + Kokkos::deep_copy(B_host, B2); + + Kokkos::fence(); + + Functor_TestBatchedTeamVectorGesv(A, X, B) + .run(); + + Kokkos::fence(); + + Kokkos::deep_copy(X_host, X); + 
+ for (int l = 0; l < N; ++l) + KokkosBatched::SerialGemv:: + invoke(-1, Kokkos::subview(A_host, l, Kokkos::ALL, Kokkos::ALL), + Kokkos::subview(X_host, l, Kokkos::ALL), 1, + Kokkos::subview(B_host, l, Kokkos::ALL)); + + KokkosBatched::SerialDot::invoke(B_host, B_host, + sqr_norm_j_host); + + const MagnitudeType eps = 1.0e3 * ats::epsilon(); + + for (int l = 0; l < N; ++l) EXPECT_NEAR_KK(sqr_norm_j_host(l), 0, eps); +} +} // namespace TeamVectorGesv +} // namespace Test + +template +int test_batched_teamvector_gesv() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View + MatrixType; + typedef Kokkos::View + VectorType; + + for (int i = 3; i < 10; ++i) { + Test::TeamVectorGesv::impl_test_batched_gesv( + 1024, i); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View + MatrixType; + typedef Kokkos::View + VectorType; + + for (int i = 3; i < 10; ++i) { + Test::TeamVectorGesv::impl_test_batched_gesv( + 1024, i); + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp new file mode 100644 index 0000000000..d83706718c --- /dev/null +++ b/unit_test/batched/dense/Test_Batched_TeamVectorGesv_Real.hpp @@ -0,0 +1,21 @@ +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, batched_scalar_teamvector_gesv_static_pivoting_float) { + test_batched_teamvector_gesv(); +} +TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_float) { + test_batched_teamvector_gesv(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, batched_scalar_teamvector_gesv_static_pivoting_double) { + test_batched_teamvector_gesv(); +} +TEST_F(TestCategory, batched_scalar_teamvector_gesv_no_pivoting_double) { + test_batched_teamvector_gesv(); +} +#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp index 4ae4ee4133..80bc7b246a 
100644 --- a/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp @@ -4,7 +4,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -#include "KokkosBatched_Set_Decl.hpp" +#include "KokkosBlas1_set.hpp" #include "KokkosBatched_Copy_Decl.hpp" #include "KokkosBatched_Gemv_Decl.hpp" #include "KokkosBatched_Trsv_Decl.hpp" @@ -49,7 +49,7 @@ struct Functor_TestBatchedTeamVectorQR { [&](const int &i) { aa(i, i) += add_this; }); /// xx = 1 - TeamVectorSet::invoke(member, one, xx); + KokkosBlas::TeamVectorSet::invoke(member, one, xx); member.team_barrier(); /// bb = AA*xx diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp index 3ae24bda84..72754a5e00 100644 --- a/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp @@ -4,7 +4,6 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Random.hpp" -#include "KokkosBatched_Set_Decl.hpp" #include "KokkosBatched_Copy_Decl.hpp" #include "KokkosBatched_ApplyPivot_Decl.hpp" #include "KokkosBatched_Gemv_Decl.hpp" diff --git a/unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp b/unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp new file mode 100644 index 0000000000..108a984a9d --- /dev/null +++ b/unit_test/batched/sparse/Test_Batched_SerialGMRES.hpp @@ -0,0 +1,239 @@ +/// \author Kim Liegeois (knliege@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" +#include "KokkosBatched_GMRES.hpp" +#include "KokkosKernels_TestUtils.hpp" +#include "KokkosBatched_CrsMatrix.hpp" +#include "Test_Batched_SparseUtils.hpp" +#include "KokkosBatched_JacobiPrec.hpp" + +using namespace KokkosBatched; + +namespace Test { +namespace GMRES { + +template +struct Functor_TestBatchedSerialGMRES { + const ValuesViewType _D; + const IntView _r; + 
const IntView _c; + const VectorViewType _X; + const VectorViewType _B; + const VectorViewType _Diag; + const int _N_team; + KrylovHandleType _handle; + + Functor_TestBatchedSerialGMRES(const ValuesViewType &D, const IntView &r, + const IntView &c, const VectorViewType &X, + const VectorViewType &B, + const VectorViewType &diag, const int N_team, + KrylovHandleType &handle) + : _D(D), + _r(r), + _c(c), + _X(X), + _B(B), + _Diag(diag), + _N_team(N_team), + _handle(handle) {} + + KOKKOS_INLINE_FUNCTION void operator()(const int k) const { + const int first_matrix = _handle.first_index(k); + const int last_matrix = _handle.last_index(k); + + auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + auto diag = Kokkos::subview( + _Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), + Kokkos::ALL); + + using Operator = KokkosBatched::CrsMatrix; + using PrecOperator = KokkosBatched::JacobiPrec; + + Operator A(d, _r, _c); + PrecOperator P(diag); + P.setComputedInverse(); + + KokkosBatched::SerialGMRES::template invoke( + A, b, x, P, _handle, k); + } + + inline void run() { + typedef typename ValuesViewType::value_type value_type; + std::string name_region("KokkosBatched::Test::SerialGMRES"); + const std::string name_value_type = Test::value_type_name(); + std::string name = name_region + name_value_type; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _D.extent(0) / _N_team); + + const int N = _D.extent(0); + const int n = _X.extent(1); + const int maximum_iteration = _handle.get_max_iteration(); + + _handle.set_ortho_strategy(0); + _handle.set_compute_last_residual(false); + _handle.set_tolerance(1e-8); + + _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType( + "", N, maximum_iteration, n + maximum_iteration + 
3); + _handle.tmp_view = typename KrylovHandleType::TemporaryViewType( + "", N, n + maximum_iteration + 3); + + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + } +}; + +template +void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { + typedef typename ValuesViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + const int nnz = (BlkSize - 2) * 3 + 2 * 2; + + VectorViewType X("x0", N, BlkSize); + VectorViewType R("r0", N, BlkSize); + VectorViewType B("b", N, BlkSize); + ValuesViewType D("D", N, nnz); + ValuesViewType Diag("Diag", N, BlkSize); + IntView r("r", BlkSize + 1); + IntView c("c", nnz); + + using ScalarType = typename ValuesViewType::non_const_value_type; + using Layout = typename ValuesViewType::array_layout; + using EXSP = typename ValuesViewType::execution_space; + + using MagnitudeType = + typename Kokkos::Details::ArithTraits::mag_type; + using NormViewType = Kokkos::View; + + using Norm2DViewType = Kokkos::View; + using Scalar3DViewType = Kokkos::View; + using IntViewType = Kokkos::View; + + using KrylovHandleType = + KrylovHandle; + + NormViewType sqr_norm_0("sqr_norm_0", N); + NormViewType sqr_norm_j("sqr_norm_j", N); + + create_tridiagonal_batched_matrices(nnz, BlkSize, N, r, c, D, X, B); + + { + auto diag_values_host = Kokkos::create_mirror_view(Diag); + auto values_host = Kokkos::create_mirror_view(D); + auto row_ptr_host = Kokkos::create_mirror_view(r); + auto colIndices_host = Kokkos::create_mirror_view(c); + + Kokkos::deep_copy(values_host, D); + Kokkos::deep_copy(row_ptr_host, r); + Kokkos::deep_copy(colIndices_host, c); + + int current_index; + for (int i = 0; i < BlkSize; ++i) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); + ++current_index) { + if (colIndices_host(current_index) == i) break; + } + for (int j = 0; j < N; ++j) + diag_values_host(j, i) = values_host(j, current_index); + } + + Kokkos::deep_copy(Diag, 
diag_values_host); + } + + // Compute initial norm + + Kokkos::deep_copy(R, B); + + auto sqr_norm_0_host = Kokkos::create_mirror_view(sqr_norm_0); + auto sqr_norm_j_host = Kokkos::create_mirror_view(sqr_norm_j); + auto R_host = Kokkos::create_mirror_view(R); + auto X_host = Kokkos::create_mirror_view(X); + auto D_host = Kokkos::create_mirror_view(D); + auto r_host = Kokkos::create_mirror_view(r); + auto c_host = Kokkos::create_mirror_view(c); + + Kokkos::deep_copy(R, B); + Kokkos::deep_copy(R_host, R); + Kokkos::deep_copy(X_host, X); + + Kokkos::deep_copy(c_host, c); + Kokkos::deep_copy(r_host, r); + Kokkos::deep_copy(D_host, D); + + const int n_iterations = 10; + KrylovHandleType handle(N, N_team, n_iterations); + + KokkosBatched::SerialSpmv::template invoke< + typename ValuesViewType::HostMirror, typename IntView::HostMirror, + typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, + 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, + sqr_norm_0_host); + Functor_TestBatchedSerialGMRES( + D, r, c, X, B, Diag, N_team, handle) + .run(); + + Kokkos::fence(); + + Kokkos::deep_copy(R, B); + Kokkos::deep_copy(R_host, R); + Kokkos::deep_copy(X_host, X); + + KokkosBatched::SerialSpmv::template invoke< + typename ValuesViewType::HostMirror, typename IntView::HostMirror, + typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, + 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); + KokkosBatched::SerialDot::invoke(R_host, R_host, + sqr_norm_j_host); + + const MagnitudeType eps = 1.0e5 * ats::epsilon(); + + for (int l = 0; l < N; ++l) + EXPECT_NEAR_KK( + std::sqrt(sqr_norm_j_host(l)) / std::sqrt(sqr_norm_0_host(l)), 0, eps); +} +} // namespace GMRES +} // namespace Test + +template +int test_batched_serial_GMRES() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View ViewType; + typedef Kokkos::View IntView; + typedef Kokkos::View + VectorViewType; + + for (int i = 3; 
i < 10; ++i) { + Test::GMRES::impl_test_batched_GMRES(1024, i, 2); + } + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View + ViewType; + typedef Kokkos::View IntView; + typedef Kokkos::View + VectorViewType; + + for (int i = 3; i < 10; ++i) { + Test::GMRES::impl_test_batched_GMRES(1024, i, 2); + } + } +#endif + + return 0; +} diff --git a/unit_test/batched/sparse/Test_Batched_SerialGMRES_Real.hpp b/unit_test/batched/sparse/Test_Batched_SerialGMRES_Real.hpp new file mode 100644 index 0000000000..acaa2f0ed2 --- /dev/null +++ b/unit_test/batched/sparse/Test_Batched_SerialGMRES_Real.hpp @@ -0,0 +1,12 @@ + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, batched_scalar_serial_GMRES_float) { + test_batched_serial_GMRES(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, batched_scalar_serial_GMRES_double) { + test_batched_serial_GMRES(); +} +#endif diff --git a/unit_test/batched/sparse/Test_Batched_Sparse.hpp b/unit_test/batched/sparse/Test_Batched_Sparse.hpp index 4b36400d2e..36bfc43528 100644 --- a/unit_test/batched/sparse/Test_Batched_Sparse.hpp +++ b/unit_test/batched/sparse/Test_Batched_Sparse.hpp @@ -2,6 +2,8 @@ #define TEST_BATCHED_SPARSE_HPP // Serial kernels +#include "Test_Batched_SerialGMRES.hpp" +#include "Test_Batched_SerialGMRES_Real.hpp" #include "Test_Batched_SerialSpmv.hpp" #include "Test_Batched_SerialSpmv_Real.hpp" diff --git a/unit_test/batched/sparse/Test_Batched_TeamCG.hpp b/unit_test/batched/sparse/Test_Batched_TeamCG.hpp index 3e606d1508..8cfc76410b 100644 --- a/unit_test/batched/sparse/Test_Batched_TeamCG.hpp +++ b/unit_test/batched/sparse/Test_Batched_TeamCG.hpp @@ -14,7 +14,7 @@ namespace Test { namespace TeamCG { template + typename VectorViewType, typename KrylovHandleType> struct Functor_TestBatchedTeamCG { const ValuesViewType _D; const IntView _r; @@ -22,13 +22,18 @@ struct Functor_TestBatchedTeamCG { const VectorViewType _X; const VectorViewType _B; const int _N_team; - 
KrylovHandle handle; + KrylovHandleType handle; - KOKKOS_INLINE_FUNCTION Functor_TestBatchedTeamCG(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team) - : _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team) {} + : _D(D), + _r(r), + _c(c), + _X(X), + _B(B), + _N_team(N_team), + handle(KrylovHandleType(_D.extent(0), _N_team)) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -50,9 +55,7 @@ struct Functor_TestBatchedTeamCG { Operator A(d, _r, _c); - KokkosBatched::TeamCG::template invoke( - member, A, b, x, handle); + KokkosBatched::TeamCG::invoke(member, A, b, x, handle); } inline void run() { @@ -96,6 +99,13 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { typename Kokkos::Details::ArithTraits::mag_type; using NormViewType = Kokkos::View; + using Norm2DViewType = Kokkos::View; + using Scalar3DViewType = Kokkos::View; + using IntViewType = Kokkos::View; + + using KrylovHandleType = + KrylovHandle; + NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -127,8 +137,8 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { 1>(-1, D_host, r_host, c_host, X_host, 1, R_host); KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); - Functor_TestBatchedTeamCG(D, r, c, X, B, N_team) + Functor_TestBatchedTeamCG(D, r, c, X, B, N_team) .run(); Kokkos::fence(); diff --git a/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp b/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp index f724553590..553d4d3419 100644 --- a/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp +++ b/unit_test/batched/sparse/Test_Batched_TeamGMRES.hpp @@ -15,21 +15,30 @@ namespace Test { namespace TeamGMRES { template + typename VectorViewType, typename KrylovHandleType> struct Functor_TestBatchedTeamGMRES { const ValuesViewType _D; const IntView _r; const IntView _c; const VectorViewType 
_X; const VectorViewType _B; + const VectorViewType _Diag; const int _N_team; - KrylovHandle handle; + KrylovHandleType _handle; - KOKKOS_INLINE_FUNCTION Functor_TestBatchedTeamGMRES(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, - const VectorViewType &B, const int N_team) - : _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team) {} + const VectorViewType &B, + const VectorViewType &diag, const int N_team, + KrylovHandleType &handle) + : _D(D), + _r(r), + _c(c), + _X(X), + _B(B), + _Diag(diag), + _N_team(N_team), + _handle(handle) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -42,18 +51,23 @@ struct Functor_TestBatchedTeamGMRES { auto d = Kokkos::subview(_D, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); + auto diag = Kokkos::subview( + _Diag, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); auto x = Kokkos::subview(_X, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); auto b = Kokkos::subview(_B, Kokkos::make_pair(first_matrix, last_matrix), Kokkos::ALL); - using Operator = KokkosBatched::CrsMatrix; + using Operator = KokkosBatched::CrsMatrix; + using PrecOperator = KokkosBatched::JacobiPrec; Operator A(d, _r, _c); + PrecOperator P(diag); + P.setComputedInverse(); KokkosBatched::TeamGMRES::template invoke( - member, A, b, x, handle); + member, A, b, x, P, _handle); } inline void run() { @@ -63,20 +77,37 @@ struct Functor_TestBatchedTeamGMRES { std::string name = name_region + name_value_type; Kokkos::Profiling::pushRegion(name.c_str()); Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, - Kokkos::AUTO()); + Kokkos::AUTO(), Kokkos::AUTO()); + + const int N = _D.extent(0); + const int n = _X.extent(1); + const int maximum_iteration = _handle.get_max_iteration(); + + _handle.set_ortho_strategy(0); + _handle.set_compute_last_residual(false); + _handle.set_tolerance(1e-8); + + _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType( + "", N, 
maximum_iteration, n + maximum_iteration + 3); - size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1)); - size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); + using ScalarType = typename ValuesViewType::non_const_value_type; + using Layout = typename ValuesViewType::array_layout; + using EXSP = typename ValuesViewType::execution_space; - handle.set_max_iteration(10); + using ViewType2D = Kokkos::View; - int maximum_iteration = handle.get_max_iteration(); + size_t bytes_1D = ViewType2D::shmem_size(_N_team, 1); + size_t bytes_2D_1 = ViewType2D::shmem_size(_N_team, _X.extent(1)); + size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1); - policy.set_scratch_size(0, Kokkos::PerTeam(5 * bytes_0 + 5 * bytes_1)); + size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0)); + size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); + + size_t bytes_int = bytes_row_ptr + bytes_col_idc; + size_t bytes_diag = bytes_2D_1; + size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; policy.set_scratch_size( - 1, Kokkos::PerTeam(maximum_iteration * bytes_0 + - ((maximum_iteration + 3) * maximum_iteration) * - bytes_1)); + 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); @@ -95,6 +126,7 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { VectorViewType R("r0", N, BlkSize); VectorViewType B("b", N, BlkSize); ValuesViewType D("D", N, nnz); + ValuesViewType Diag("Diag", N, BlkSize); IntView r("r", BlkSize + 1); IntView c("c", nnz); @@ -106,11 +138,41 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typename Kokkos::Details::ArithTraits::mag_type; using NormViewType = Kokkos::View; + using Norm2DViewType = Kokkos::View; + using Scalar3DViewType = Kokkos::View; + using IntViewType = Kokkos::View; + + using KrylovHandleType = + KrylovHandle; + NormViewType sqr_norm_0("sqr_norm_0", N); 
NormViewType sqr_norm_j("sqr_norm_j", N); create_tridiagonal_batched_matrices(nnz, BlkSize, N, r, c, D, X, B); + { + auto diag_values_host = Kokkos::create_mirror_view(Diag); + auto values_host = Kokkos::create_mirror_view(D); + auto row_ptr_host = Kokkos::create_mirror_view(r); + auto colIndices_host = Kokkos::create_mirror_view(c); + + Kokkos::deep_copy(values_host, D); + Kokkos::deep_copy(row_ptr_host, r); + Kokkos::deep_copy(colIndices_host, c); + + int current_index; + for (int i = 0; i < BlkSize; ++i) { + for (current_index = row_ptr_host(i); current_index < row_ptr_host(i + 1); + ++current_index) { + if (colIndices_host(current_index) == i) break; + } + for (int j = 0; j < N; ++j) + diag_values_host(j, i) = values_host(j, current_index); + } + + Kokkos::deep_copy(Diag, diag_values_host); + } + // Compute initial norm Kokkos::deep_copy(R, B); @@ -131,6 +193,9 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(r_host, r); Kokkos::deep_copy(D_host, D); + const int n_iterations = 10; + KrylovHandleType handle(N, N_team, n_iterations); + KokkosBatched::SerialSpmv::template invoke< typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, @@ -138,7 +203,8 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); Functor_TestBatchedTeamGMRES(D, r, c, X, B, N_team) + VectorViewType, KrylovHandleType>( + D, r, c, X, B, Diag, N_team, handle) .run(); Kokkos::fence(); diff --git a/unit_test/batched/sparse/Test_Batched_TeamVectorCG.hpp b/unit_test/batched/sparse/Test_Batched_TeamVectorCG.hpp index 6637d9858d..d9fb350726 100644 --- a/unit_test/batched/sparse/Test_Batched_TeamVectorCG.hpp +++ b/unit_test/batched/sparse/Test_Batched_TeamVectorCG.hpp @@ -14,7 +14,7 @@ namespace Test { namespace TeamVectorCG { template + typename VectorViewType, 
typename KrylovHandleType> struct Functor_TestBatchedTeamVectorCG { const ValuesViewType _D; const IntView _r; @@ -22,13 +22,18 @@ struct Functor_TestBatchedTeamVectorCG { const VectorViewType _X; const VectorViewType _B; const int _N_team; - KrylovHandle handle; + KrylovHandleType handle; - KOKKOS_INLINE_FUNCTION Functor_TestBatchedTeamVectorCG(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const int N_team) - : _D(D), _r(r), _c(c), _X(X), _B(B), _N_team(N_team) {} + : _D(D), + _r(r), + _c(c), + _X(X), + _B(B), + _N_team(N_team), + handle(KrylovHandleType(_D.extent(0), _N_team)) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -96,6 +101,13 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { typename Kokkos::Details::ArithTraits::mag_type; using NormViewType = Kokkos::View; + using Norm2DViewType = Kokkos::View; + using Scalar3DViewType = Kokkos::View; + using IntViewType = Kokkos::View; + + using KrylovHandleType = + KrylovHandle; + NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -128,7 +140,8 @@ void impl_test_batched_CG(const int N, const int BlkSize, const int N_team) { KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); Functor_TestBatchedTeamVectorCG(D, r, c, X, B, N_team) + VectorViewType, KrylovHandleType>(D, r, c, X, + B, N_team) .run(); Kokkos::fence(); diff --git a/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp b/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp index 87e9da0281..17f72c8963 100644 --- a/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp +++ b/unit_test/batched/sparse/Test_Batched_TeamVectorGMRES.hpp @@ -15,7 +15,7 @@ namespace Test { namespace TeamVectorGMRES { template + typename VectorViewType, typename KrylovHandleType> struct Functor_TestBatchedTeamVectorGMRES { const ValuesViewType _D; const IntView _r; @@ -24,15 +24,21 @@ 
struct Functor_TestBatchedTeamVectorGMRES { const VectorViewType _B; const VectorViewType _Diag; const int _N_team; - KrylovHandle handle; + KrylovHandleType _handle; - KOKKOS_INLINE_FUNCTION Functor_TestBatchedTeamVectorGMRES(const ValuesViewType &D, const IntView &r, const IntView &c, const VectorViewType &X, const VectorViewType &B, const VectorViewType &diag, - const int N_team) - : _D(D), _r(r), _c(c), _X(X), _B(B), _Diag(diag), _N_team(N_team) {} + const int N_team, KrylovHandleType &handle) + : _D(D), + _r(r), + _c(c), + _X(X), + _B(B), + _Diag(diag), + _N_team(N_team), + _handle(handle) {} template KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { @@ -57,10 +63,11 @@ struct Functor_TestBatchedTeamVectorGMRES { Operator A(d, _r, _c); PrecOperator P(diag); + P.setComputedInverse(); KokkosBatched::TeamVectorGMRES::template invoke( - member, A, b, x, P, handle); + member, A, b, x, P, _handle); } inline void run() { @@ -72,18 +79,35 @@ struct Functor_TestBatchedTeamVectorGMRES { Kokkos::TeamPolicy policy(_D.extent(0) / _N_team, Kokkos::AUTO(), Kokkos::AUTO()); - size_t bytes_0 = ValuesViewType::shmem_size(_N_team, _X.extent(1)); - size_t bytes_1 = ValuesViewType::shmem_size(_N_team, 1); + const int N = _D.extent(0); + const int n = _X.extent(1); + const int maximum_iteration = _handle.get_max_iteration(); + + _handle.set_ortho_strategy(0); + _handle.set_compute_last_residual(false); + _handle.set_tolerance(1e-8); + + _handle.Arnoldi_view = typename KrylovHandleType::ArnoldiViewType( + "", N, maximum_iteration, n + maximum_iteration + 3); + + using ScalarType = typename ValuesViewType::non_const_value_type; + using Layout = typename ValuesViewType::array_layout; + using EXSP = typename ValuesViewType::execution_space; - handle.set_max_iteration(10); + using ViewType2D = Kokkos::View; - int maximum_iteration = handle.get_max_iteration(); + size_t bytes_1D = ViewType2D::shmem_size(_N_team, 1); + size_t bytes_2D_1 = 
ViewType2D::shmem_size(_N_team, _X.extent(1)); + size_t bytes_2D_2 = ViewType2D::shmem_size(_N_team, maximum_iteration + 1); - policy.set_scratch_size(0, Kokkos::PerTeam(5 * bytes_0 + 5 * bytes_1)); + size_t bytes_row_ptr = IntView::shmem_size(_r.extent(0)); + size_t bytes_col_idc = IntView::shmem_size(_c.extent(0)); + + size_t bytes_int = bytes_row_ptr + bytes_col_idc; + size_t bytes_diag = bytes_2D_1; + size_t bytes_tmp = 2 * bytes_2D_1 + 2 * bytes_1D + bytes_2D_2; policy.set_scratch_size( - 1, Kokkos::PerTeam(maximum_iteration * bytes_0 + - ((maximum_iteration + 3) * maximum_iteration) * - bytes_1)); + 0, Kokkos::PerTeam(bytes_tmp + bytes_diag + bytes_int)); Kokkos::parallel_for(name.c_str(), policy, *this); Kokkos::Profiling::popRegion(); @@ -114,6 +138,13 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { typename Kokkos::Details::ArithTraits::mag_type; using NormViewType = Kokkos::View; + using Norm2DViewType = Kokkos::View; + using Scalar3DViewType = Kokkos::View; + using IntViewType = Kokkos::View; + + using KrylovHandleType = + KrylovHandle; + NormViewType sqr_norm_0("sqr_norm_0", N); NormViewType sqr_norm_j("sqr_norm_j", N); @@ -162,6 +193,9 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { Kokkos::deep_copy(r_host, r); Kokkos::deep_copy(D_host, D); + const int n_iterations = 10; + KrylovHandleType handle(N, N_team, n_iterations); + KokkosBatched::SerialSpmv::template invoke< typename ValuesViewType::HostMirror, typename IntView::HostMirror, typename VectorViewType::HostMirror, typename VectorViewType::HostMirror, @@ -169,8 +203,8 @@ void impl_test_batched_GMRES(const int N, const int BlkSize, const int N_team) { KokkosBatched::SerialDot::invoke(R_host, R_host, sqr_norm_0_host); Functor_TestBatchedTeamVectorGMRES(D, r, c, X, B, Diag, - N_team) + VectorViewType, KrylovHandleType>( + D, r, c, X, B, Diag, N_team, handle) .run(); Kokkos::fence(); diff --git a/unit_test/blas/Test_Blas.hpp 
b/unit_test/blas/Test_Blas.hpp index 16d54e3dce..c607e74ca8 100644 --- a/unit_test/blas/Test_Blas.hpp +++ b/unit_test/blas/Test_Blas.hpp @@ -23,7 +23,13 @@ #include "Test_Blas1_sum.hpp" #include "Test_Blas1_update.hpp" +// Serial Blas 1 +#include "Test_Blas1_serial_setscal.hpp" +#include "Test_Blas_serial_axpy.hpp" +#include "Test_Blas_serial_nrm2.hpp" + // Team Blas 1 +#include "Test_Blas1_team_setscal.hpp" #include "Test_Blas1_team_abs.hpp" #include "Test_Blas1_team_axpby.hpp" #include "Test_Blas1_team_axpy.hpp" @@ -44,6 +50,9 @@ #include "Test_Blas3_trmm.hpp" #include "Test_Blas3_trsm.hpp" +// Stuff that should move later on +#include "Test_Blas_Newton.hpp" + // TPLs #include "Test_Blas_rocblas.hpp" diff --git a/unit_test/blas/Test_Blas1_dot.hpp b/unit_test/blas/Test_Blas1_dot.hpp index b2e3f95628..83dfd6048c 100644 --- a/unit_test/blas/Test_Blas1_dot.hpp +++ b/unit_test/blas/Test_Blas1_dot.hpp @@ -111,6 +111,7 @@ void impl_test_dot_mv(int N, int K) { Kokkos::View r("Dot::Result", K); KokkosBlas::dot(r, a, b); + Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA nonconst_nonconst_result = r(k); EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result[k], @@ -118,6 +119,7 @@ void impl_test_dot_mv(int N, int K) { } KokkosBlas::dot(r, c_a, c_b); + Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_const_result = r(k); EXPECT_NEAR_KK(const_const_result, expected_result[k], @@ -125,6 +127,7 @@ void impl_test_dot_mv(int N, int K) { } KokkosBlas::dot(r, a, c_b); + Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA non_const_const_result = r(k); EXPECT_NEAR_KK(non_const_const_result, expected_result[k], @@ -132,6 +135,7 @@ void impl_test_dot_mv(int N, int K) { } KokkosBlas::dot(r, c_a, b); + Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_non_const_result = r(k); EXPECT_NEAR_KK(const_non_const_result, expected_result[k], diff --git a/unit_test/blas/Test_Blas1_iamax.hpp b/unit_test/blas/Test_Blas1_iamax.hpp index 88c21be83c..82f1fc1c76 
100644 --- a/unit_test/blas/Test_Blas1_iamax.hpp +++ b/unit_test/blas/Test_Blas1_iamax.hpp @@ -61,6 +61,7 @@ void impl_test_iamax(int N) { ViewType0D r("Iamax::Result 0-D View on host"); KokkosBlas::iamax(r, a); + Kokkos::fence(); size_type nonconst_max_loc = r(); ASSERT_EQ(nonconst_max_loc, expected_max_loc); @@ -151,6 +152,7 @@ void impl_test_iamax_mv(int N, int K) { r("Iamax::Result View on host", K); KokkosBlas::iamax(r, a); + Kokkos::fence(); for (int k = 0; k < K; k++) { size_type nonconst_result = r(k); @@ -159,6 +161,7 @@ void impl_test_iamax_mv(int N, int K) { } KokkosBlas::iamax(r, c_a); + Kokkos::fence(); for (int k = 0; k < K; k++) { size_type const_result = r(k); diff --git a/unit_test/blas/Test_Blas1_nrm1.hpp b/unit_test/blas/Test_Blas1_nrm1.hpp index c68492b6dd..1c476cbf43 100644 --- a/unit_test/blas/Test_Blas1_nrm1.hpp +++ b/unit_test/blas/Test_Blas1_nrm1.hpp @@ -98,6 +98,7 @@ void impl_test_nrm1_mv(int N, int K) { KokkosBlas::nrm1(r, a); KokkosBlas::nrm1(c_r, a); + Kokkos::fence(); for (int k = 0; k < K; k++) { EXPECT_NEAR_KK(r(k), expected_result(k), eps * expected_result(k)); EXPECT_NEAR_KK(c_r(k), expected_result(k), eps * expected_result(k)); diff --git a/unit_test/blas/Test_Blas1_nrm2.hpp b/unit_test/blas/Test_Blas1_nrm2.hpp index 688035f842..c568b12564 100644 --- a/unit_test/blas/Test_Blas1_nrm2.hpp +++ b/unit_test/blas/Test_Blas1_nrm2.hpp @@ -84,6 +84,7 @@ void impl_test_nrm2_mv(int N, int K) { Kokkos::View r("Dot::Result", K); KokkosBlas::nrm2(r, a); + Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); EXPECT_NEAR_KK(nonconst_result, expected_result[k], @@ -91,6 +92,7 @@ void impl_test_nrm2_mv(int N, int K) { } KokkosBlas::nrm2(r, c_a); + Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type const_result = r(k); EXPECT_NEAR_KK(const_result, expected_result[k], eps * expected_result[k]); diff --git a/unit_test/blas/Test_Blas1_nrm2_squared.hpp 
b/unit_test/blas/Test_Blas1_nrm2_squared.hpp index 317b9b543b..98c2cf7e8f 100644 --- a/unit_test/blas/Test_Blas1_nrm2_squared.hpp +++ b/unit_test/blas/Test_Blas1_nrm2_squared.hpp @@ -93,6 +93,7 @@ void impl_test_nrm2_squared_mv(int N, int K) { Kokkos::View r("Dot::Result", K); KokkosBlas::nrm2_squared(r, a); + Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type nonconst_result = r(k); typename AT::mag_type divisor = @@ -103,6 +104,7 @@ void impl_test_nrm2_squared_mv(int N, int K) { } KokkosBlas::nrm2_squared(r, c_a); + Kokkos::fence(); for (int k = 0; k < K; k++) { typename AT::mag_type const_result = r(k); typename AT::mag_type divisor = diff --git a/unit_test/blas/Test_Blas1_serial_setscal.hpp b/unit_test/blas/Test_Blas1_serial_setscal.hpp new file mode 100644 index 0000000000..2e2a207c47 --- /dev/null +++ b/unit_test/blas/Test_Blas1_serial_setscal.hpp @@ -0,0 +1,246 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBlas1_set.hpp" +#include "KokkosBlas1_scal.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +using namespace KokkosBlas; + +namespace Test { + +enum : int { BlasSet = 0, BlasScale = 1 }; + +struct KokkosKernelTag {}; +struct NaiveTag {}; + +template +struct Functor_TestBlasSerialMatUtil { + ScalarType _alpha; + ViewType _a; + + KOKKOS_INLINE_FUNCTION + Functor_TestBlasSerialMatUtil(const ScalarType alpha, const ViewType &a) + : _alpha(alpha), _a(a) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const KokkosKernelTag &, const int i) const { + auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL()); + switch (TestID) { + case BlasSet: KokkosBlas::SerialSet::invoke(_alpha, A); break; + case BlasScale: KokkosBlas::SerialScale::invoke(_alpha, A); break; + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const NaiveTag &, const int k) const { + // MD Note: changing because of the error with -werror + auto A 
= Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + const int m = A.extent(0), n = A.extent(1); + switch (TestID) { + case BlasSet: { + for (int i = 0; i < m; ++i) + for (int j = 0; j < n; ++j) A(i, j) = _alpha; + break; + } + case BlasScale: { + for (int i = 0; i < m; ++i) + for (int j = 0; j < n; ++j) A(i, j) *= _alpha; + break; + } + } + } + + inline int run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBlas::Test::SerialMatUtil"); + const std::string name_value_type = Test::value_type_name(); + std::string name_work_tag = + (std::is_same::value + ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = + (TestID == BlasSet ? "Set" + : TestID == BlasScale ? "Scale" : "UnknownTest"); + std::string name = + name_region + name_value_type + name_work_tag + name_test_id; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _a.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + return 0; + } +}; + +template +void impl_test_blas_matutil(const int N, const int BlkSize) { + /// typedefs + typedef typename ViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + /// randomized input testing views + const ScalarType alpha = 11.1; + ViewType a("a", N, BlkSize, BlkSize); + ViewType b("b", N, BlkSize, BlkSize); + + Kokkos::Random_XorShift64_Pool random( + 13718); + Kokkos::fill_random(a, random, value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(b, a); + + /// test body + Functor_TestBlasSerialMatUtil(alpha, a) + .run(); + Functor_TestBlasSerialMatUtil(alpha, b) + .run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename ViewType::HostMirror a_host = Kokkos::create_mirror_view(a); + typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b); + + Kokkos::deep_copy(a_host, a); + Kokkos::deep_copy(b_host, b); + + /// check a = b + typename 
ats::mag_type eps = + 100 * std::numeric_limits::epsilon(); + for (int k = 0; k < N; ++k) + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) + EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); +} +} // namespace Test + +template +int test_blas_matutil() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View + ViewType; + Test::impl_test_blas_matutil(0, + 10); + Test::impl_test_blas_matutil(10, + 15); + Test::impl_test_blas_matutil(1024, + 9); + Test::impl_test_blas_matutil( + 132231, 3); + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View + ViewType; + Test::impl_test_blas_matutil(0, + 10); + Test::impl_test_blas_matutil(10, + 15); + Test::impl_test_blas_matutil(1024, + 9); + Test::impl_test_blas_matutil( + 132231, 3); + } +#endif + + return 0; +} + +// Real test cases + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, blas_scalar_serial_set_float_float) { + test_blas_matutil(); +} +TEST_F(TestCategory, blas_scalar_serial_scale_float_float) { + test_blas_matutil(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, blas_scalar_serial_set_double_double) { + test_blas_matutil(); +} +TEST_F(TestCategory, blas_scalar_serial_scale_double_double) { + test_blas_matutil(); +} +#endif + +// Complex test cases + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_dcomplex) { + test_blas_matutil, + Kokkos::complex, ::Test::BlasSet>(); +} +TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_dcomplex) { + test_blas_matutil, + Kokkos::complex, ::Test::BlasScale>(); +} +TEST_F(TestCategory, blas_scalar_serial_set_dcomplex_double) { + test_blas_matutil, double, + ::Test::BlasSet>(); +} +TEST_F(TestCategory, blas_scalar_serial_scale_dcomplex_double) { + test_blas_matutil, double, + ::Test::BlasScale>(); +} +#endif diff --git a/unit_test/blas/Test_Blas1_sum.hpp b/unit_test/blas/Test_Blas1_sum.hpp index 
2b7f51370e..5ad2ef038b 100644 --- a/unit_test/blas/Test_Blas1_sum.hpp +++ b/unit_test/blas/Test_Blas1_sum.hpp @@ -73,6 +73,7 @@ void impl_test_sum_mv(int N, int K) { Kokkos::View r("Sum::Result", K); KokkosBlas::sum(r, a); + Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA nonconst_result = r(k); EXPECT_NEAR_KK(nonconst_result, expected_result[k], @@ -80,6 +81,7 @@ void impl_test_sum_mv(int N, int K) { } KokkosBlas::sum(r, c_a); + Kokkos::fence(); for (int k = 0; k < K; k++) { ScalarA const_result = r(k); EXPECT_NEAR_KK(const_result, expected_result[k], eps * expected_result[k]); diff --git a/unit_test/blas/Test_Blas1_team_setscal.hpp b/unit_test/blas/Test_Blas1_team_setscal.hpp new file mode 100644 index 0000000000..394c7b6c2d --- /dev/null +++ b/unit_test/blas/Test_Blas1_team_setscal.hpp @@ -0,0 +1,259 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \author Kyungjoo Kim (kyukim@sandia.gov) + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" + +#include "KokkosBlas1_set.hpp" +#include "KokkosBlas1_scal.hpp" + +#include "KokkosKernels_TestUtils.hpp" + +namespace Test { +namespace TeamMatUtil { + +enum : int { BlasSet = 0, BlasScale = 1 }; + +struct KokkosKernelTag {}; +struct NaiveTag {}; + +template +struct Functor_TestBlasTeamMatUtil { + ScalarType _alpha; + ViewType _a; + + KOKKOS_INLINE_FUNCTION + Functor_TestBlasTeamMatUtil(const ScalarType alpha, const ViewType &a) + : _alpha(alpha), _a(a) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(const KokkosKernelTag &, + const MemberType &member) const { + const int i = member.league_rank(); + auto A = Kokkos::subview(_a, i, Kokkos::ALL(), Kokkos::ALL()); + switch (TestID) { + case BlasSet: + KokkosBlas::TeamSet::invoke(member, _alpha, A); + break; + case BlasScale: + KokkosBlas::TeamScale::invoke(member, _alpha, A); + break; + } + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const NaiveTag &, + 
const MemberType &member) const { + if (member.team_rank() == 0) { + const int k = member.league_rank(); + auto A = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + const int m = A.extent(0), n = A.extent(1); + switch (TestID) { + case BlasSet: { + for (int i = 0; i < m; ++i) + for (int j = 0; j < n; ++j) A(i, j) = _alpha; + break; + } + case BlasScale: { + for (int i = 0; i < m; ++i) + for (int j = 0; j < n; ++j) A(i, j) *= _alpha; + break; + } + } + } + } + + inline int run() { + typedef typename ViewType::value_type value_type; + std::string name_region("KokkosBlas::Test::SerialMatUtil"); + const std::string name_value_type = Test::value_type_name(); + std::string name_work_tag = + (std::is_same::value + ? "::KokkosBlas" + : std::is_same::value ? "::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = + (TestID == BlasSet ? "Set" + : TestID == BlasScale ? "Scale" : "UnknownTest"); + std::string name = + name_region + name_value_type + name_work_tag + name_test_id; + Kokkos::Profiling::pushRegion(name.c_str()); + + const int league_size = _a.extent(0); + Kokkos::TeamPolicy policy(league_size, + Kokkos::AUTO); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + + return 0; + } +}; + +template +void impl_test_blas_matutil(const int N, const int BlkSize) { + /// typedefs + typedef typename ViewType::value_type value_type; + typedef Kokkos::Details::ArithTraits ats; + + /// randomized input testing views + const ScalarType alpha = 11.1; + ViewType a("a", N, BlkSize, BlkSize); + ViewType b("b", N, BlkSize, BlkSize); + + Kokkos::Random_XorShift64_Pool random( + 13718); + Kokkos::fill_random(a, random, value_type(1.0)); + + Kokkos::fence(); + + Kokkos::deep_copy(b, a); + + /// test body + Functor_TestBlasTeamMatUtil(alpha, a) + .run(); + Functor_TestBlasTeamMatUtil(alpha, b) + .run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename ViewType::HostMirror a_host = Kokkos::create_mirror_view(a); + 
typename ViewType::HostMirror b_host = Kokkos::create_mirror_view(b); + + Kokkos::deep_copy(a_host, a); + Kokkos::deep_copy(b_host, b); + + /// check a = b + typename ats::mag_type eps = + 100 * std::numeric_limits::epsilon(); + for (int k = 0; k < N; ++k) + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) + EXPECT_NEAR_KK(b_host(k, i, j), a_host(k, i, j), eps); +} +} // namespace TeamMatUtil +} // namespace Test + +template +int test_blas_team_matutil() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View + ViewType; + Test::TeamMatUtil::impl_test_blas_matutil(0, 10); + Test::TeamMatUtil::impl_test_blas_matutil(10, 15); + Test::TeamMatUtil::impl_test_blas_matutil(1024, 9); + Test::TeamMatUtil::impl_test_blas_matutil(132231, 3); + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef Kokkos::View + ViewType; + Test::TeamMatUtil::impl_test_blas_matutil(0, 10); + Test::TeamMatUtil::impl_test_blas_matutil(10, 15); + Test::TeamMatUtil::impl_test_blas_matutil(1024, 9); + Test::TeamMatUtil::impl_test_blas_matutil(132231, 3); + } +#endif + + return 0; +} + +// Real test cases + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, blas_scalar_team_set_float_float) { + test_blas_team_matutil(); +} +TEST_F(TestCategory, blas_scalar_team_scale_float_float) { + test_blas_team_matutil(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, blas_scalar_team_set_double_double) { + test_blas_team_matutil(); +} +TEST_F(TestCategory, blas_scalar_team_scale_double_double) { + test_blas_team_matutil(); +} +#endif + +// Complex test cases + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, blas_scalar_team_set_dcomplex_dcomplex) { + test_blas_team_matutil, + Kokkos::complex, ::Test::BlasSet>(); +} +TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_dcomplex) { + test_blas_team_matutil, + Kokkos::complex, ::Test::BlasScale>(); +} +TEST_F(TestCategory, 
blas_scalar_team_set_dcomplex_double) { + test_blas_team_matutil, double, + ::Test::BlasSet>(); +} +TEST_F(TestCategory, blas_scalar_team_scale_dcomplex_double) { + test_blas_team_matutil, double, + ::Test::BlasScale>(); +} +#endif diff --git a/unit_test/blas/Test_Blas_Newton.hpp b/unit_test/blas/Test_Blas_Newton.hpp new file mode 100644 index 0000000000..600ba3e0b6 --- /dev/null +++ b/unit_test/blas/Test_Blas_Newton.hpp @@ -0,0 +1,187 @@ +#include + +#include +#include + +namespace Test { + +// Logistic equation +// dy/dt=y(1-y) +// +// solution y = 1/(1+exp(-t)) +// y(0)=0.5 +// +// Using BDF1 to integrate: +// y-y_n=dt*y*(1-y) +// +// Residual: r = y - y_n - dt*y*(1-y) +// Jacobian: J = 1 - dt + 2*dt*y +template +struct LogisticEquation { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + const int neqs = 1; + scalar_type dt; + vec_type state; + + LogisticEquation(const scalar_type dt_, vec_type initial_state) + : dt(dt_), state(initial_state) {} + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& dydt) const { + dydt(0) = y(0) - state(0) - dt * y(0) * (1 - y(0)); + } + + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + jac(0, 0) = 1 - dt + 2 * dt * y(0); + } + + KOKKOS_FUNCTION scalar_type expected_val(const scalar_type t) const { + using Kokkos::exp; + + return static_cast(1 / (1 + exp(-t))); + } + + KOKKOS_FUNCTION int num_equations() const { return neqs; } +}; + +// Intersection of circle and hyperbola +// x^2 + y^2 = 20 +// x^2 - y^2 = -2 +// +// solution: x = +/- 3 +// y = +/- sqrt(11) +// +// Residual: r = [x^2 + y^2 - 20] +// [x^2 - y^2 + 2] +// Jacobian: J = [2*x, 2*y] +// [2*x, -2*y] +template +struct Intersection { + using vec_type = Kokkos::View; + using mat_type = Kokkos::View; + + const int neqs = 2; + + Intersection() = default; + + KOKKOS_FUNCTION void residual(const vec_type& y, const vec_type& dydt) const { + dydt(0) = y(0) * y(0) + y(1) * y(1) - 20; + dydt(1) = y(0) * y(0) - 
y(1) * y(1) + 2; + } + + KOKKOS_FUNCTION void jacobian(const vec_type& y, const mat_type& jac) const { + jac(0, 0) = 2 * y(0); + jac(0, 1) = 2 * y(1); + jac(1, 0) = 2 * y(0); + jac(1, 1) = -2 * y(1); + } + + KOKKOS_FUNCTION int num_equations() const { return neqs; } +}; + +template +struct NewtonWrapper { + solver newton_solver; + + NewtonWrapper(solver newton_solver_) : newton_solver(newton_solver_){}; + + KOKKOS_INLINE_FUNCTION + void operator()(const int /* system_index */) const { newton_solver.solve(); } +}; + +template +int test_logistic() { + using vec_type = typename Kokkos::View; + using mat_type = typename Kokkos::View; + using norm_type = typename Kokkos::View; + using handle_type = KokkosBlas::Impl::NewtonHandle; + using system_type = LogisticEquation; + using newton_type = + KokkosBlas::Impl::NewtonFunctor; + + // Create the non-linear system and initialize data + vec_type state("state", 1); + Kokkos::deep_copy(state, 0.5); + system_type ode(0.1, state); + + vec_type x("solution vector", 1), rhs("right hand side vector", 1); + Kokkos::deep_copy(x, 0.5); + + // Create the solver and wrapper + handle_type handle; + handle.debug_mode = false; + newton_type newton_solver(ode, x, rhs, handle); + NewtonWrapper wrapper(newton_solver); + + // Launch the problem in a parallel_for + Kokkos::RangePolicy my_policy(0, 1); + Kokkos::parallel_for(my_policy, wrapper); + + // Get the solution back and test it + auto x_h = Kokkos::create_mirror_view(x); + Kokkos::deep_copy(x_h, x); + printf("Non-linear problem solution:\n"); + printf(" [%f]\n", x_h(0)); + + return 0; +} + +template +int test_intersection() { + using vec_type = typename Kokkos::View; + using mat_type = typename Kokkos::View; + using norm_type = typename Kokkos::View; + using handle_type = KokkosBlas::Impl::NewtonHandle; + using system_type = Intersection; + using newton_type = + KokkosBlas::Impl::NewtonFunctor; + + // Create the non-linear system and initialize data + system_type intersection; + vec_type 
x("solution vector", 2), rhs("right hand side vector", 2); + { + typename vec_type::HostMirror x_h = Kokkos::create_mirror_view(x); + x_h(0) = 2.5; + x_h(1) = 3.0; + Kokkos::deep_copy(x, x_h); + } + + // Create the solver and wrapper + handle_type handle; + handle.debug_mode = false; + newton_type newton_solver(intersection, x, rhs, handle); + NewtonWrapper wrapper(newton_solver); + + // Launch the problem in a parallel_for + Kokkos::RangePolicy my_policy(0, 1); + Kokkos::parallel_for(my_policy, wrapper); + + // Get the solution back and test it + auto x_h = Kokkos::create_mirror_view(x); + Kokkos::deep_copy(x_h, x); + printf("Non-linear problem solution:\n"); + for (int idx = 0; idx < x_h.extent_int(0); ++idx) { + printf(" [%f]\n", x_h(idx)); + } + EXPECT_NEAR_KK(x_h(0), 3.0, 3.0e-4); + EXPECT_NEAR_KK(x_h(1), 3.3166247903553998, 3.3166247903553998 * 1.0e-4); + + return 0; +} + +} // namespace Test + +template +int test_newton() { + Test::test_logistic(); + Test::test_intersection(); + + return 1; +} + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, newton_serial) { test_newton(); } +#endif diff --git a/unit_test/blas/Test_Blas_serial_axpy.hpp b/unit_test/blas/Test_Blas_serial_axpy.hpp new file mode 100644 index 0000000000..83892640a7 --- /dev/null +++ b/unit_test/blas/Test_Blas_serial_axpy.hpp @@ -0,0 +1,218 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Luc Berger-Vergiat (lberge@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TEST_BLAS_SERIAL_AXPY_HPP_ +#define TEST_BLAS_SERIAL_AXPY_HPP_ + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" +#include "KokkosKernels_TestUtils.hpp" + +#include "KokkosBlas1_axpby.hpp" + +namespace Test { + +struct KokkosKernelAxpyTag {}; +struct NaiveAxpyTag {}; + +template +struct Functor_TestBlasSerialAxpy { + ScalarType _alpha; + ViewType _x; + ViewType _y; + + KOKKOS_INLINE_FUNCTION + Functor_TestBlasSerialAxpy(const ScalarType alpha, const ViewType &x, + const ViewType &y) + : _alpha(alpha), _x(x), _y(y) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const KokkosKernelAxpyTag &, const int i) const { + auto X = Kokkos::subview(_x, i, Kokkos::ALL(), Kokkos::ALL()); + auto Y = Kokkos::subview(_y, i, Kokkos::ALL(), Kokkos::ALL()); + KokkosBlas::serial_axpy(_alpha, X, Y); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const NaiveAxpyTag &, const int k) const { + auto X = Kokkos::subview(_x, k, Kokkos::ALL(), Kokkos::ALL()); + auto Y = Kokkos::subview(_y, k, Kokkos::ALL(), Kokkos::ALL()); + const int m = X.extent(0), n = X.extent(1); + for (int i = 0; i < m; ++i) + for (int j = 0; j < n; ++j) Y(i, j) += _alpha * X(i, j); + } + + inline void run() { + using value_type = typename ViewType::value_type; + std::string name_region("KokkosBlas::Test::SerialAxpy"); + const std::string name_value_type = Test::value_type_name(); + std::string name_work_tag = + (std::is_same::value + ? "::KokkosBlas" + : std::is_same::value + ? 
"::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = "Axpy"; + std::string name = + name_region + name_value_type + name_work_tag + name_test_id; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _x.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + return; + } +}; + +template +void impl_test_blas_serial_axpy(const int N, const int BlkSize) { + /// typedefs + using value_type = typename ViewType::value_type; + using ats = Kokkos::ArithTraits; + + /// radomized input testing views + const ScalarType alpha = 11.1; + ViewType X("X", N, BlkSize, BlkSize); + ViewType Y("Y", N, BlkSize, BlkSize); + ViewType Yref("Yref", N, BlkSize, BlkSize); + + Kokkos::Random_XorShift64_Pool random( + 13718); + Kokkos::fill_random(X, random, ats::one()); + Kokkos::fill_random(Y, random, ats::one()); + Kokkos::fence(); + Kokkos::deep_copy(Yref, Y); + + /// test body + Functor_TestBlasSerialAxpy( + alpha, X, Yref) + .run(); + Functor_TestBlasSerialAxpy(alpha, X, Y) + .run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename ViewType::HostMirror Y_host = Kokkos::create_mirror_view(Y); + typename ViewType::HostMirror Yref_host = Kokkos::create_mirror_view(Yref); + + Kokkos::deep_copy(Y_host, Y); + Kokkos::deep_copy(Yref_host, Yref); + + /// check a = b + typename ats::mag_type eps = + 100 * std::numeric_limits::epsilon(); + for (int k = 0; k < N; ++k) + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) + EXPECT_NEAR_KK(Y_host(k, i, j), Yref_host(k, i, j), eps); +} + +} // namespace Test + +template +int test_blas_serial_axpy() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + typedef Kokkos::View + ViewType; + Test::impl_test_blas_serial_axpy(0, 10); + Test::impl_test_blas_serial_axpy(10, 15); + Test::impl_test_blas_serial_axpy(1024, 9); + Test::impl_test_blas_serial_axpy(132231, + 3); + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + typedef 
Kokkos::View + ViewType; + Test::impl_test_blas_serial_axpy(0, 10); + Test::impl_test_blas_serial_axpy(10, 15); + Test::impl_test_blas_serial_axpy(1024, 9); + Test::impl_test_blas_serial_axpy(132231, + 3); + } +#endif + + return 0; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, serial_axpy_float_float) { + test_blas_serial_axpy(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, serial_axpy_double_double) { + test_blas_serial_axpy(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, serial_axpy_dcomplex_dcomplex) { + test_blas_serial_axpy, + Kokkos::complex >(); +} + +TEST_F(TestCategory, serial_axpy_dcomplex_double) { + test_blas_serial_axpy, double>(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, serial_axpy_fcomplex_fcomplex) { + test_blas_serial_axpy, + Kokkos::complex >(); +} + +TEST_F(TestCategory, serial_axpy_fcomplex_float) { + test_blas_serial_axpy, float>(); +} +#endif + +#endif // TEST_BLAS_SERIAL_AXPY_HPP_ diff --git a/unit_test/blas/Test_Blas_serial_nrm2.hpp b/unit_test/blas/Test_Blas_serial_nrm2.hpp new file mode 100644 index 0000000000..1a2721e782 --- /dev/null +++ b/unit_test/blas/Test_Blas_serial_nrm2.hpp @@ -0,0 +1,316 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Luc Berger-Vergiat (lberge@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TEST_BLAS_SERIAL_NRM2_HPP_ +#define TEST_BLAS_SERIAL_NRM2_HPP_ + +#include "gtest/gtest.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_Random.hpp" +#include "KokkosKernels_TestUtils.hpp" + +#include "KokkosBlas1_nrm2.hpp" + +namespace Test { + +template +struct Functor_TestBlasSerialNrm2 { + using execution_space = typename DeviceType::execution_space; + using value_type = typename ViewType::non_const_value_type; + using IPT = Kokkos::Details::InnerProductSpaceTraits; + using norm_type = typename IPT::mag_type; + using norm_view_type = Kokkos::View; + + ViewType _x; + norm_view_type _nrm; + + KOKKOS_INLINE_FUNCTION + Functor_TestBlasSerialNrm2(const ViewType &x, const norm_view_type &nrm) + : _x(x), _nrm(nrm) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const KokkosKernelTag &, const int i) const { + auto X = Kokkos::subview(_x, i, Kokkos::ALL()); + _nrm(i) = KokkosBlas::serial_nrm2(X); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const NaiveTag &, const int k) const { + auto X = Kokkos::subview(_x, k, Kokkos::ALL()); + _nrm(k) = Kokkos::ArithTraits::zero(); + for (int i = 0; i < X.extent_int(0); ++i) { + _nrm(k) += IPT::norm(IPT::dot(X(i), X(i))); + } + + _nrm(k) = Kokkos::ArithTraits::sqrt(_nrm(k)); + } + + inline void run() { + std::string name_region("KokkosBlas::Test::SerialNrm2"); + const std::string name_value_type = Test::value_type_name(); + std::string name_work_tag = + (std::is_same::value + ? "::KokkosBlas" + : std::is_same::value ? 
"::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = "Nrm2"; + std::string name = + name_region + name_value_type + name_work_tag + name_test_id; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _x.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + return; + } +}; + +template +struct Functor_TestBlasSerialNrm2MV { + using execution_space = typename DeviceType::execution_space; + using value_type = typename ViewType::non_const_value_type; + using IPT = Kokkos::Details::InnerProductSpaceTraits; + using norm_type = typename IPT::mag_type; + using norm_view_type = Kokkos::View; + + ViewType _x; + norm_view_type _nrm; + + KOKKOS_INLINE_FUNCTION + Functor_TestBlasSerialNrm2MV(const ViewType &x, const norm_view_type &nrm) + : _x(x), _nrm(nrm) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const KokkosKernelTag &, const int i) const { + auto X = Kokkos::subview(_x, i, Kokkos::ALL(), Kokkos::ALL()); + auto R = Kokkos::subview(_nrm, i, Kokkos::ALL()); + KokkosBlas::serial_nrm2(X, R); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const NaiveTag &, const int k) const { + auto X = Kokkos::subview(_x, k, Kokkos::ALL(), Kokkos::ALL()); + auto R = Kokkos::subview(_nrm, k, Kokkos::ALL()); + + for (int colIdx = 0; colIdx < X.extent_int(1); ++colIdx) { + R(colIdx) = Kokkos::ArithTraits::zero(); + for (int rowIdx = 0; rowIdx < X.extent_int(0); ++rowIdx) { + R(colIdx) += IPT::norm(IPT::dot(X(rowIdx, colIdx), X(rowIdx, colIdx))); + } + R(colIdx) = Kokkos::ArithTraits::sqrt(R(colIdx)); + } + } + + inline void run() { + std::string name_region("KokkosBlas::Test::SerialNrm2MV"); + const std::string name_value_type = Test::value_type_name(); + std::string name_work_tag = + (std::is_same::value + ? "::KokkosBlas" + : std::is_same::value ? 
"::Naive" + : "::UnknownWorkTag"); + std::string name_test_id = "Nrm2"; + std::string name = + name_region + name_value_type + name_work_tag + name_test_id; + Kokkos::Profiling::pushRegion(name.c_str()); + Kokkos::RangePolicy policy(0, _x.extent(0)); + Kokkos::parallel_for(name.c_str(), policy, *this); + Kokkos::Profiling::popRegion(); + return; + } +}; + +template +void impl_test_blas_serial_nrm2(const int N, const int BlkSize) { + /// typedefs + using execution_space = typename DeviceType::execution_space; + using value_type = typename ViewType::non_const_value_type; + using ats = Kokkos::ArithTraits; + using IPT = Kokkos::Details::InnerProductSpaceTraits; + using norm_type = typename IPT::mag_type; + using norm_view_type = Kokkos::View; + + /// radomized input testing views + ViewType X("X", N, BlkSize); + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X, random, ats::one()); + Kokkos::fence(); + + norm_view_type norms("norms", N); + norm_view_type norms_ref("ref norms", N); + + /// test body + Functor_TestBlasSerialNrm2(X, norms).run(); + Functor_TestBlasSerialNrm2(X, + norms_ref) + .run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename norm_view_type::HostMirror norms_host = + Kokkos::create_mirror_view(norms); + typename norm_view_type::HostMirror norms_ref_host = + Kokkos::create_mirror_view(norms_ref); + + Kokkos::deep_copy(norms_host, norms); + Kokkos::deep_copy(norms_ref_host, norms_ref); + + /// check a = b + typename ats::mag_type eps = + 100 * std::numeric_limits::epsilon(); + for (int k = 0; k < N; ++k) + EXPECT_NEAR_KK(norms_host(k), norms_ref_host(k), eps); +} + +template +void impl_test_blas_serial_nrm2mv(const int N, const int vecLength, + const int numVecs) { + /// typedefs + using execution_space = typename DeviceType::execution_space; + using value_type = typename ViewType::non_const_value_type; + using ats = Kokkos::ArithTraits; + using IPT = Kokkos::Details::InnerProductSpaceTraits; + using 
norm_type = typename IPT::mag_type; + using norm_view_type = Kokkos::View; + + /// radomized input testing views + ViewType X("X", N, vecLength, numVecs); + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(X, random, ats::one()); + Kokkos::fence(); + + norm_view_type norms("norms", N, numVecs); + norm_view_type norms_ref("ref norms", N, numVecs); + + /// test body + Functor_TestBlasSerialNrm2MV(X, norms).run(); + Functor_TestBlasSerialNrm2MV(X, + norms_ref) + .run(); + + Kokkos::fence(); + + /// for comparison send it to host + typename norm_view_type::HostMirror norms_host = + Kokkos::create_mirror_view(norms); + typename norm_view_type::HostMirror norms_ref_host = + Kokkos::create_mirror_view(norms_ref); + + Kokkos::deep_copy(norms_host, norms); + Kokkos::deep_copy(norms_ref_host, norms_ref); + + /// check a = b + typename ats::mag_type eps = + 100 * std::numeric_limits::epsilon(); + for (int k = 0; k < N; ++k) + for (int vecIdx = 0; vecIdx < numVecs; ++vecIdx) + EXPECT_NEAR_KK(norms_host(k, vecIdx), norms_ref_host(k, vecIdx), eps); +} + +} // namespace Test + +template +int test_blas_serial_nrm2() { +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + { + using ViewType = Kokkos::View; + Test::impl_test_blas_serial_nrm2(0, 10); + Test::impl_test_blas_serial_nrm2(10, 15); + Test::impl_test_blas_serial_nrm2(1024, 9); + Test::impl_test_blas_serial_nrm2(132231, 3); + + using MVViewType = + Kokkos::View; + Test::impl_test_blas_serial_nrm2mv(0, 10, 5); + Test::impl_test_blas_serial_nrm2mv(10, 15, 7); + Test::impl_test_blas_serial_nrm2mv(1024, 9, 5); + Test::impl_test_blas_serial_nrm2mv(132231, 3, 3); + } +#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + { + using ViewType = + Kokkos::View; + Test::impl_test_blas_serial_nrm2(0, 10); + Test::impl_test_blas_serial_nrm2(10, 15); + Test::impl_test_blas_serial_nrm2(1024, 9); + Test::impl_test_blas_serial_nrm2(132231, 3); + + using MVViewType = + Kokkos::View; + Test::impl_test_blas_serial_nrm2mv(0, 10, 5); 
+ Test::impl_test_blas_serial_nrm2mv(10, 15, 5); + Test::impl_test_blas_serial_nrm2mv(1024, 9, 5); + Test::impl_test_blas_serial_nrm2mv(132231, 3, 3); + } +#endif + + return 0; +} + +#if defined(KOKKOSKERNELS_INST_FLOAT) +TEST_F(TestCategory, serial_nrm2_float_float) { + test_blas_serial_nrm2(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_DOUBLE) +TEST_F(TestCategory, serial_nrm2_double_double) { + test_blas_serial_nrm2(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_FLOAT) +TEST_F(TestCategory, serial_nrm2_fcomplex_float) { + test_blas_serial_nrm2 >(); +} +#endif + +#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) +TEST_F(TestCategory, serial_nrm2_dcomplex_dcomplex) { + test_blas_serial_nrm2 >(); +} +#endif + +#endif // TEST_BLAS_SERIAL_NRM2_HPP_ diff --git a/unit_test/common/Test_Common.hpp b/unit_test/common/Test_Common.hpp index 0a194071a8..20b875f4a5 100644 --- a/unit_test/common/Test_Common.hpp +++ b/unit_test/common/Test_Common.hpp @@ -1,15 +1,12 @@ #ifndef TEST_COMMON_HPP #define TEST_COMMON_HPP -// FIXME_SYCL still some uses of the wrong namespace -#ifndef KOKKOS_ENABLE_SYCL #include -#endif // #include #include #include -#include #include #include +#include #endif // TEST_COMMON_HPP diff --git a/unit_test/common/Test_Common_ArithTraits.hpp b/unit_test/common/Test_Common_ArithTraits.hpp index 38a6ba7d78..19b0ce9d15 100644 --- a/unit_test/common/Test_Common_ArithTraits.hpp +++ b/unit_test/common/Test_Common_ArithTraits.hpp @@ -163,8 +163,8 @@ class ArithTraitsTesterBase { /// \brief Combine two intermediate reduction results into \c dst. /// /// Subclasses need not and must not override this method. 
- KOKKOS_INLINE_FUNCTION void join(volatile value_type& dst, - const volatile value_type& src) const { + KOKKOS_INLINE_FUNCTION void join(value_type& dst, + const value_type& src) const { dst = dst && src; // dst = 1; } @@ -1722,6 +1722,10 @@ int runAllArithTraitsHostTests(std::ostream& out, const int verbose) { // testArithTraitsOnHost, DeviceType> (out, // verbose); +#if defined(KOKKOS_ENABLE_LIBQUADMATH) + success = success && curSuccess; + curSuccess = testArithTraitsOnHost<__float128, DeviceType>(out, verbose); +#endif return success && curSuccess; } diff --git a/unit_test/common/Test_Common_Controls.hpp b/unit_test/common/Test_Common_Controls.hpp new file mode 100644 index 0000000000..48c2a96715 --- /dev/null +++ b/unit_test/common/Test_Common_Controls.hpp @@ -0,0 +1,72 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef TEST_COMMON_CONTROLS_HPP +#define TEST_COMMON_CONTROLS_HPP + +#include "KokkosKernels_Controls.hpp" + +void test_controls_empty() { + KokkosKernels::Experimental::Controls c; + EXPECT_EQ(c.isParameter(""), false); + EXPECT_EQ(c.getParameter(""), ""); + EXPECT_EQ(c.getParameter("", "default"), "default"); +} + +void test_controls_set() { + KokkosKernels::Experimental::Controls c; + c.setParameter("key", "value"); + EXPECT_EQ(c.isParameter("key"), true); + EXPECT_EQ(c.getParameter("key"), "value"); + EXPECT_EQ(c.getParameter("key", "default"), "value"); + + EXPECT_EQ(c.isParameter(""), false); + EXPECT_EQ(c.getParameter(""), ""); + EXPECT_EQ(c.getParameter("", "default"), "default"); +} + +TEST_F(TestCategory, controls_empty) { test_controls_empty(); } +TEST_F(TestCategory, controls_set) { test_controls_set(); } + +#endif // TEST_COMMON_CONTROLS_HPP diff --git a/unit_test/common/Test_Common_Sorting.hpp b/unit_test/common/Test_Common_Sorting.hpp index 1580a0c98b..f0320cb637 100644 --- a/unit_test/common/Test_Common_Sorting.hpp +++ 
b/unit_test/common/Test_Common_Sorting.hpp @@ -525,226 +525,6 @@ void testBitonicSortLexicographic() { ASSERT_TRUE(ordered); } -template -void testSortCRS(default_lno_t numRows, default_lno_t numCols, - default_size_type nnz, bool doValues, bool doStructInterface) { - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; - using crsMat_t = - KokkosSparse::CrsMatrix; - using rowmap_t = typename crsMat_t::row_map_type; - using entries_t = typename crsMat_t::index_type; - using values_t = typename crsMat_t::values_type; - // Create a random matrix on device - // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this - // wouldn't test anything - crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix( - numRows, numCols, nnz, 2, numCols / 2); - auto rowmap = A.graph.row_map; - auto entries = A.graph.entries; - auto values = A.values; - Kokkos::View rowmapHost("rowmap host", - numRows + 1); - Kokkos::View entriesHost("sorted entries host", - nnz); - Kokkos::View valuesHost("sorted values host", - nnz); - Kokkos::deep_copy(rowmapHost, rowmap); - Kokkos::deep_copy(entriesHost, entries); - Kokkos::deep_copy(valuesHost, values); - struct ColValue { - ColValue() {} - ColValue(lno_t c, scalar_t v) : col(c), val(v) {} - bool operator<(const ColValue& rhs) const { return col < rhs.col; } - bool operator==(const ColValue& rhs) const { - return col == rhs.col && val == rhs.val; - } - lno_t col; - scalar_t val; - }; - // sort one row at a time on host using STL. 
- { - for (lno_t i = 0; i < numRows; i++) { - std::vector rowCopy; - for (size_type j = rowmapHost(i); j < rowmapHost(i + 1); j++) - rowCopy.emplace_back(entriesHost(j), valuesHost(j)); - std::sort(rowCopy.begin(), rowCopy.end()); - // write sorted row back - for (size_t j = 0; j < rowCopy.size(); j++) { - entriesHost(rowmapHost(i) + j) = rowCopy[j].col; - valuesHost(rowmapHost(i) + j) = rowCopy[j].val; - } - } - } - // call the actual sort routine being tested - if (doValues) { - if (doStructInterface) { - KokkosKernels::sort_crs_matrix(A); - } else { - KokkosKernels::sort_crs_matrix( - A.graph.row_map, A.graph.entries, A.values); - } - } else { - if (doStructInterface) { - KokkosKernels::sort_crs_graph(A.graph); - } else { - KokkosKernels::sort_crs_graph( - A.graph.row_map, A.graph.entries); - } - } - // Copy to host and compare - Kokkos::View entriesOut("sorted entries host", - nnz); - Kokkos::View valuesOut("sorted values host", - nnz); - Kokkos::deep_copy(entriesOut, entries); - Kokkos::deep_copy(valuesOut, values); - for (size_type i = 0; i < nnz; i++) { - EXPECT_EQ(entriesHost(i), entriesOut(i)) - << "Sorted column indices are wrong!"; - if (doValues) { - EXPECT_EQ(valuesHost(i), valuesOut(i)) << "Sorted values are wrong!"; - } - } -} - -template -void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { - // This test is about bug #960. 
- using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; - using crsMat_t = - KokkosSparse::CrsMatrix, - size_type>; - using crsMat_Managed_t = - KokkosSparse::CrsMatrix; - using rowmap_t = typename crsMat_t::row_map_type; - using entries_t = typename crsMat_t::index_type; - using values_t = typename crsMat_t::values_type; - const lno_t numRows = 50; - const lno_t numCols = numRows; - size_type nnz = numRows * 5; - // Create a random matrix on device - // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this - // wouldn't test anything - crsMat_Managed_t A_managed = - KokkosKernels::Impl::kk_generate_sparse_matrix( - numRows, numCols, nnz, 2, numCols / 2); - crsMat_t A(A_managed); - auto rowmap = A.graph.row_map; - auto entries = A.graph.entries; - auto values = A.values; - if (doValues) { - if (doStructInterface) { - KokkosKernels::sort_crs_matrix(A); - } else { - KokkosKernels::sort_crs_matrix( - A.graph.row_map, A.graph.entries, A.values); - } - } else { - if (doStructInterface) { - KokkosKernels::sort_crs_graph(A.graph); - } else { - KokkosKernels::sort_crs_graph( - A.graph.row_map, A.graph.entries); - } - } -} - -template -void testSortAndMerge() { - using size_type = default_size_type; - using lno_t = default_lno_t; - using scalar_t = default_scalar; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; - using crsMat_t = - KokkosSparse::CrsMatrix; - using rowmap_t = typename crsMat_t::row_map_type::non_const_type; - using entries_t = typename crsMat_t::index_type; - using values_t = typename crsMat_t::values_type; - using Kokkos::HostSpace; - using Kokkos::MemoryTraits; - using Kokkos::Unmanaged; - // Create a small CRS matrix on host - std::vector inRowmap = {0, 4, 4, 5, 7, 10}; - std::vector inEntries = { - 4, 3, 5, 3, // row 0 - // row 1 has no entries - 6, // 
row 2 - 2, 2, // row 3 - 0, 1, 2 // row 4 - }; - // note: choosing values that can be represented exactly by float - std::vector inValues = { - 1.5, 4, 1, -3, // row 0 - // row 1 - 2, // row 2 - -1, -2, // row 3 - 0, 3.5, -2.25 // row 4 - }; - lno_t nrows = 5; - lno_t ncols = 7; - size_type nnz = inEntries.size(); - Kokkos::View> hostInRowmap( - inRowmap.data(), nrows + 1); - Kokkos::View> hostInEntries( - inEntries.data(), nnz); - Kokkos::View> hostInValues( - inValues.data(), nnz); - rowmap_t devInRowmap("", nrows + 1); - entries_t devInEntries("", nnz); - values_t devInValues("", nnz); - Kokkos::deep_copy(devInRowmap, hostInRowmap); - Kokkos::deep_copy(devInEntries, hostInEntries); - Kokkos::deep_copy(devInValues, hostInValues); - crsMat_t input("Input", nrows, ncols, nnz, devInValues, devInRowmap, - devInEntries); - crsMat_t output = KokkosKernels::sort_and_merge_matrix(input); - exec_space().fence(); - EXPECT_EQ(output.numRows(), nrows); - EXPECT_EQ(output.numCols(), ncols); - auto outRowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - output.graph.row_map); - auto outEntries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), - output.graph.entries); - auto outValues = - Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), output.values); - // Expect 2 merges to have taken place - std::vector goldRowmap = {0, 3, 3, 4, 5, 8}; - std::vector goldEntries = { - 3, 4, 5, // row 0 - // row 1 has no entries - 6, // row 2 - 2, // row 3 - 0, 1, 2 // row 4 - }; - // note: choosing values that can be represented exactly by float - std::vector goldValues = { - 1, 1.5, 1, // row 0 - // row 1 - 2, // row 2 - -3, // row 3 - 0, 3.5, -2.25 // row 4 - }; - EXPECT_EQ(goldRowmap.size(), outRowmap.extent(0)); - EXPECT_EQ(goldEntries.size(), outEntries.extent(0)); - EXPECT_EQ(goldValues.size(), outValues.extent(0)); - EXPECT_EQ(goldValues.size(), output.nnz()); - for (lno_t i = 0; i < nrows + 1; i++) EXPECT_EQ(goldRowmap[i], outRowmap(i)); - for 
(size_type i = 0; i < output.nnz(); i++) { - EXPECT_EQ(goldEntries[i], outEntries(i)); - EXPECT_EQ(goldValues[i], outValues(i)); - } -} - TEST_F(TestCategory, common_serial_radix) { // Test serial radix over some contiguous small arrays // 1st arg is #arrays, 2nd arg is max subarray size @@ -805,31 +585,4 @@ TEST_F(TestCategory, common_device_bitonic) { testBitonicSortLexicographic(); } -TEST_F(TestCategory, common_sort_crsgraph) { - for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { - testSortCRS(10, 10, 20, false, doStructInterface); - testSortCRS(100, 100, 2000, false, doStructInterface); - testSortCRS(1000, 1000, 30000, false, doStructInterface); - testSortCRSUnmanaged(false, doStructInterface); - } -} - -TEST_F(TestCategory, common_sort_crsmatrix) { - for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { - testSortCRS(10, 10, 20, true, doStructInterface); - testSortCRS(100, 100, 2000, true, doStructInterface); - testSortCRS(1000, 1000, 30000, true, doStructInterface); - testSortCRSUnmanaged(true, doStructInterface); - } -} - -TEST_F(TestCategory, common_sort_crs_longrows) { - testSortCRS(1, 50000, 10000, false, false); - testSortCRS(1, 50000, 10000, true, false); -} - -TEST_F(TestCategory, common_sort_merge_crsmatrix) { - testSortAndMerge(); -} - #endif diff --git a/unit_test/common/Test_Common_Test_All_Type_Combos.hpp b/unit_test/common/Test_Common_Test_All_Type_Combos.hpp new file mode 100644 index 0000000000..afacb09ee9 --- /dev/null +++ b/unit_test/common/Test_Common_Test_All_Type_Combos.hpp @@ -0,0 +1,190 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Test_Common_Test_All_Type_Combos.hpp + +/** + * KOKKOSKERNELS_EXECUTE_TEST should take (SCALAR, ORDINAL, OFFSET, DEVICE). All + * these args are types. 
+ * #define NO_TEST_COMPLEX to skip testing of kokkos complex types + */ + +#if !defined(KOKKOSKERNELS_EXECUTE_TEST) +#error Test_Common_Test_All_Type_Combos.hpp requires KOKKOSKERNELS_EXECUTE_TEST to be set +#endif + +#if (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) + +// ETI is off, test all possible type combos + +KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) + +#if !defined(NO_TEST_COMPLEX) + +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, + TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) + +#endif + +#else + +// ETI is on, only test instantiated type combos + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +KOKKOSKERNELS_EXECUTE_TEST(double, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +KOKKOSKERNELS_EXECUTE_TEST(double, 
int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +KOKKOSKERNELS_EXECUTE_TEST(double, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +KOKKOSKERNELS_EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +KOKKOSKERNELS_EXECUTE_TEST(float, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +KOKKOSKERNELS_EXECUTE_TEST(float, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +KOKKOSKERNELS_EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) +#endif + +#if !defined(NO_TEST_COMPLEX) + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) 
+KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, + TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) +KOKKOSKERNELS_EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) +#endif + +#endif // !NO_TEST_COMPLEX + +#endif // ETI ON diff --git a/unit_test/common/Test_Common_Transpose.hpp b/unit_test/common/Test_Common_Transpose.hpp deleted file mode 100644 index fba29da81d..0000000000 --- a/unit_test/common/Test_Common_Transpose.hpp +++ /dev/null @@ -1,173 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -/// \file Test_Common_Transpose.hpp - -#ifndef KOKKOSKERNELS_TRANSPOSE_HPP -#define KOKKOSKERNELS_TRANSPOSE_HPP - -#include -#include -#include -#include -#include -#include -#include - -template -struct ExactCompare { - ExactCompare(const V& v1_, const V& v2_) : v1(v1_), v2(v2_) {} - - KOKKOS_INLINE_FUNCTION void operator()(size_type i, size_type& ldiffs) const { - if (v1(i) != v2(i)) ldiffs++; - } - - V v1; - V v2; -}; - -template -void testTranspose(int numRows, int numCols, bool doValues) { - using range_pol = Kokkos::RangePolicy; - using scalar_t = default_scalar; - using lno_t = default_lno_t; - using size_type = default_size_type; - using mem_space = typename exec_space::memory_space; - using device_t = Kokkos::Device; - using crsMat_t = typename KokkosSparse::CrsMatrix; - using c_rowmap_t = typename crsMat_t::row_map_type; - using c_entries_t = typename crsMat_t::index_type; - using c_values_t = typename crsMat_t::values_type; - using rowmap_t = typename crsMat_t::row_map_type::non_const_type; - using entries_t = typename crsMat_t::index_type::non_const_type; - using values_t = typename crsMat_t::values_type::non_const_type; - size_type nnz = 10 * numRows; - // Generate a matrix that has 0 entries in some rows - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( - numRows, numCols, nnz, 3 * 10, numRows / 2); - // compute the transpose while unsorted, then transpose again - rowmap_t t_rowmap("Rowmap^T", numCols + 1); // this view is initialized to 0 - entries_t t_entries( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T"), - input_mat.graph.entries.extent(0)); - values_t t_values(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"), - input_mat.values.extent(0)); - rowmap_t tt_rowmap("Rowmap^T^T", - numRows + 1); // this view is initialized to 0 - entries_t tt_entries( - 
Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T^T"), - input_mat.graph.entries.extent(0)); - values_t tt_values( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"), - input_mat.values.extent(0)); - if (doValues) { - KokkosKernels::Impl::transpose_matrix( - numRows, numCols, input_mat.graph.row_map, input_mat.graph.entries, - input_mat.values, t_rowmap, t_entries, t_values); - KokkosKernels::Impl::transpose_matrix( - numCols, numRows, t_rowmap, t_entries, t_values, tt_rowmap, tt_entries, - tt_values); - } else { - KokkosKernels::Impl::transpose_graph( - numRows, numCols, input_mat.graph.row_map, input_mat.graph.entries, - t_rowmap, t_entries); - KokkosKernels::Impl::transpose_graph( - numCols, numRows, t_rowmap, t_entries, tt_rowmap, tt_entries); - } - // Sort both the transpose-transpose, and the original matrix (to compare - // directly) - KokkosKernels::sort_crs_matrix(input_mat); - KokkosKernels::sort_crs_matrix( - tt_rowmap, tt_entries, tt_values); - // The views should now be exactly identical, since they represent the same - // matrix and are sorted - size_type rowmapDiffs; - Kokkos::parallel_reduce( - range_pol(0, numRows + 1), - ExactCompare(input_mat.graph.row_map, tt_rowmap), - rowmapDiffs); - size_type entriesDiffs; - Kokkos::parallel_reduce( - range_pol(0, input_mat.nnz()), - ExactCompare(input_mat.graph.entries, tt_entries), - entriesDiffs); - EXPECT_EQ(size_type(0), rowmapDiffs); - EXPECT_EQ(size_type(0), entriesDiffs); - if (doValues) { - size_type valuesDiffs; - Kokkos::parallel_reduce( - range_pol(0, input_mat.nnz()), - ExactCompare(input_mat.values, tt_values), - valuesDiffs); - EXPECT_EQ(size_type(0), valuesDiffs); - } -} - -TEST_F(TestCategory, common_transpose_matrix) { - // Test both matrix and graph transpose with various sizes - testTranspose(100, 100, true); - testTranspose(500, 50, true); - testTranspose(50, 500, true); - testTranspose(4000, 2000, true); - testTranspose(2000, 4000, true); - testTranspose(2000, 2000, 
true); -} - -TEST_F(TestCategory, common_transpose_graph) { - testTranspose(100, 100, false); - testTranspose(500, 50, false); - testTranspose(50, 500, false); - testTranspose(4000, 2000, false); - testTranspose(2000, 4000, false); - testTranspose(2000, 2000, false); -} - -#endif diff --git a/unit_test/common/Test_Common_set_bit_count.hpp b/unit_test/common/Test_Common_set_bit_count.hpp index a085cc0024..937a2fdf1b 100644 --- a/unit_test/common/Test_Common_set_bit_count.hpp +++ b/unit_test/common/Test_Common_set_bit_count.hpp @@ -48,13 +48,9 @@ #include "KokkosKernels_BitUtils.hpp" #include "KokkosKernels_SimpleUtils.hpp" #include "KokkosKernels_PrintUtils.hpp" -#include #include #include -#include -#include - // const char *input_filename = "sherman1.mtx"; // const char *input_filename = "Si2.mtx"; // const char *input_filename = "wathen_30_30.mtx"; diff --git a/unit_test/graph/Test_Graph_graph_color.hpp b/unit_test/graph/Test_Graph_graph_color.hpp index ef7c14a931..4d35874657 100644 --- a/unit_test/graph/Test_Graph_graph_color.hpp +++ b/unit_test/graph/Test_Graph_graph_color.hpp @@ -47,9 +47,10 @@ #include "KokkosGraph_Distance1Color.hpp" #include "KokkosSparse_CrsMatrix.hpp" -#include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosKernels_Handle.hpp" +#include "KokkosKernels_default_types.hpp" using namespace KokkosKernels; using namespace KokkosKernels::Experimental; @@ -115,7 +116,7 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, // typedef typename lno_view_t::non_const_value_type size_type; lno_t numCols = numRows; - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, row_size_variance, bandwidth); typename lno_view_t::non_const_type sym_xadj; @@ -168,7 +169,7 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t 
bandwidth, const lno_t num_rows_1 = input_mat.numRows(); const lno_t num_cols_1 = input_mat.numCols(); - lno_t num_conflict = KokkosKernels::Impl::kk_is_d1_coloring_valid< + lno_t num_conflict = KokkosSparse::Impl::kk_is_d1_coloring_valid< lno_view_t, lno_nnz_view_t, color_view_t, typename device::execution_space>( num_rows_1, num_cols_1, input_mat.graph.row_map, @@ -220,31 +221,28 @@ void test_coloring(lno_t numRows, size_type nnz, lno_t bandwidth, defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, int, TestExecSpace) #endif -// FIXME_SYCL -#ifndef KOKKOS_ENABLE_SYCL #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif +EXECUTE_TEST(default_scalar, int, size_t, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, size_t, TestExecSpace) #endif #undef EXECUTE_TEST diff --git a/unit_test/graph/Test_Graph_graph_color_deterministic.hpp b/unit_test/graph/Test_Graph_graph_color_deterministic.hpp index ec718e9aa4..e2e4a3d227 100644 --- a/unit_test/graph/Test_Graph_graph_color_deterministic.hpp +++ b/unit_test/graph/Test_Graph_graph_color_deterministic.hpp @@ -48,8 +48,9 @@ #include 
"KokkosGraph_Distance1Color.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosKernels_Handle.hpp" +#include "KokkosKernels_default_types.hpp" using namespace KokkosKernels; using namespace KokkosKernels::Experimental; @@ -274,28 +275,28 @@ void test_coloring_deterministic(lno_t numRows, size_type nnz) { defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, int, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int, size_t, TestExecSpace) #endif #if (defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) +EXECUTE_TEST(default_scalar, int64_t, size_t, TestExecSpace) #endif #undef EXECUTE_TEST diff --git a/unit_test/graph/Test_Graph_graph_color_distance2.hpp b/unit_test/graph/Test_Graph_graph_color_distance2.hpp index 70158941a8..c78e8c2f5f 100644 --- a/unit_test/graph/Test_Graph_graph_color_distance2.hpp +++ b/unit_test/graph/Test_Graph_graph_color_distance2.hpp @@ -49,8 +49,8 @@ #include "KokkosGraph_Distance2Color.hpp" #include "KokkosGraph_MIS2.hpp" #include 
"KokkosSparse_CrsMatrix.hpp" -#include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" @@ -159,7 +159,7 @@ void test_dist2_coloring(lno_t numVerts, size_type nnz, lno_t bandwidth, KokkosKernelsHandle; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph @@ -216,7 +216,7 @@ void test_bipartite_symmetric(lno_t numVerts, size_type nnz, lno_t bandwidth, KokkosKernelsHandle; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph @@ -273,13 +273,13 @@ void test_bipartite(lno_t numRows, lno_t numCols, size_type nnz, KokkosKernelsHandle; // Generate graph - crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, row_size_variance, bandwidth); auto G = A.graph; rowmap_t t_rowmap("rowmap^T", numCols + 1); entries_t t_entries("entries^T", G.entries.extent(0)); - KokkosKernels::Impl::transpose_graph( + KokkosSparse::Impl::transpose_graph( numRows, numCols, G.row_map, G.entries, t_rowmap, t_entries); // TODO: remove me, shouldn't be needed even with UVM execution_space().fence(); diff --git a/unit_test/graph/Test_Graph_mis2.hpp b/unit_test/graph/Test_Graph_mis2.hpp index ed3acc3b85..c1b5e179fe 100644 --- a/unit_test/graph/Test_Graph_mis2.hpp +++ b/unit_test/graph/Test_Graph_mis2.hpp @@ -50,7 +50,8 @@ #include "KokkosGraph_ExplicitCoarsening.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include 
"KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" @@ -122,7 +123,7 @@ void test_mis2(lno_t numVerts, size_type nnz, lno_t bandwidth, using rowmap_t = typename c_rowmap_t::non_const_type; using entries_t = typename c_entries_t::non_const_type; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph @@ -164,7 +165,7 @@ void test_mis2_coarsening(lno_t numVerts, size_type nnz, lno_t bandwidth, using entries_t = typename c_entries_t::non_const_type; using labels_t = entries_t; // Generate graph, and add some out-of-bounds columns - crsMat A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat A = KokkosSparse::Impl::kk_generate_sparse_matrix( numVerts, numVerts, nnz, row_size_variance, bandwidth); auto G = A.graph; // Symmetrize the graph diff --git a/unit_test/sparse/Test_Sparse.hpp b/unit_test/sparse/Test_Sparse.hpp index 2afa0fb2db..e75eb1ce6a 100644 --- a/unit_test/sparse/Test_Sparse.hpp +++ b/unit_test/sparse/Test_Sparse.hpp @@ -12,12 +12,17 @@ #include "Test_Sparse_spadd.hpp" #include "Test_Sparse_spgemm_jacobi.hpp" #include "Test_Sparse_spgemm.hpp" +#include "Test_Sparse_bspgemm.hpp" +#include "Test_Sparse_SortCrs.hpp" #include "Test_Sparse_spiluk.hpp" #include "Test_Sparse_spmv.hpp" -//#include "Test_Sparse_spmv_blockcrs.hpp" -//#include "Test_Sparse_spmv_bsr.hpp" +#include "Test_Sparse_spmv_blockcrs.hpp" +#include "Test_Sparse_spmv_bsr.hpp" #include "Test_Sparse_sptrsv.hpp" #include "Test_Sparse_trsv.hpp" +#include "Test_Sparse_Transpose.hpp" +#include "Test_Sparse_TestUtils_RandCscMat.hpp" +#include "Test_Sparse_csc2csr.hpp" // TPL specific tests, these require // 
particular pairs of backend and TPL diff --git a/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp b/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp index e87514c3c6..6eb4488c72 100644 --- a/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_BlockCrsMatrix.hpp @@ -372,139 +372,13 @@ void testBlockCrsMatrix() { } } -#define EXECUTE_BLOCKCRS_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##blkcrsmatrix##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testBlockCrsMatrix(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_double, int, size_t, TestExecSpace) 
-#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BLOCKCRS_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#undef EXECUTE_BLOCKCRS_TEST +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_BsrMatrix.hpp b/unit_test/sparse/Test_Sparse_BsrMatrix.hpp index 49a0ce6d4f..501ebc2ead 100644 --- a/unit_test/sparse/Test_Sparse_BsrMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_BsrMatrix.hpp @@ -374,138 
+374,12 @@ void testBsrMatrix() { } } -#define EXECUTE_BSR_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##bsrmatrix##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testBsrMatrix(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
-EXECUTE_BSR_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if 
(defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#undef EXECUTE_BSR_TEST +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp index 6f67a6e8bb..8a85e43670 100644 --- a/unit_test/sparse/Test_Sparse_CrsMatrix.hpp +++ b/unit_test/sparse/Test_Sparse_CrsMatrix.hpp @@ -245,7 +245,7 @@ void testCrsMatrixHostMirror() { EXPECT_EQ(zeroHost.graph.row_map.extent(0), 0); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##crsmatrix##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testCrsMatrix(); \ @@ -257,132 +257,6 @@ void 
testCrsMatrixHostMirror() { testCrsMatrixHostMirror(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif +#include -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, 
TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_SortCrs.hpp b/unit_test/sparse/Test_Sparse_SortCrs.hpp new file mode 100644 index 0000000000..a4d30b40a1 --- /dev/null +++ b/unit_test/sparse/Test_Sparse_SortCrs.hpp @@ -0,0 +1,310 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Test_Sparse_SortCrs.hpp +/// \brief Tests for sort_crs_matrix and sort_crs_graph in +/// KokkosSparse_SortCrs.hpp + +#ifndef KOKKOSSPARSE_SORTCRSTEST_HPP +#define KOKKOSSPARSE_SORTCRSTEST_HPP + +#include +#include +#include +#include "KokkosSparse_IOUtils.hpp" +#include +#include +#include +#include +#include +#include + +template +void testSortCRS(default_lno_t numRows, default_lno_t numCols, + default_size_type nnz, bool doValues, bool doStructInterface) { + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using mem_space = typename exec_space::memory_space; + using device_t = Kokkos::Device; + using crsMat_t = + KokkosSparse::CrsMatrix; + using rowmap_t = typename crsMat_t::row_map_type; + using entries_t = typename crsMat_t::index_type; + using values_t = typename crsMat_t::values_type; + // Create a random matrix on device + // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this + // wouldn't test anything + crsMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix( + numRows, numCols, nnz, 2, numCols / 2); + auto rowmap = A.graph.row_map; + auto entries = A.graph.entries; + auto values = A.values; + Kokkos::View rowmapHost("rowmap host", + numRows + 1); + Kokkos::View entriesHost("sorted entries host", + nnz); + Kokkos::View valuesHost("sorted values host", + nnz); + Kokkos::deep_copy(rowmapHost, rowmap); + Kokkos::deep_copy(entriesHost, entries); + Kokkos::deep_copy(valuesHost, values); + struct ColValue { + ColValue() {} + ColValue(lno_t c, scalar_t v) : col(c), val(v) {} + bool operator<(const ColValue& rhs) const { return col < rhs.col; } + bool operator==(const ColValue& rhs) const { + return col == rhs.col && val == rhs.val; + } + lno_t col; + scalar_t val; + }; + // sort one row at a time on host using STL. 
+ { + for (lno_t i = 0; i < numRows; i++) { + std::vector rowCopy; + for (size_type j = rowmapHost(i); j < rowmapHost(i + 1); j++) + rowCopy.emplace_back(entriesHost(j), valuesHost(j)); + std::sort(rowCopy.begin(), rowCopy.end()); + // write sorted row back + for (size_t j = 0; j < rowCopy.size(); j++) { + entriesHost(rowmapHost(i) + j) = rowCopy[j].col; + valuesHost(rowmapHost(i) + j) = rowCopy[j].val; + } + } + } + // call the actual sort routine being tested + if (doValues) { + if (doStructInterface) { + KokkosSparse::sort_crs_matrix(A); + } else { + KokkosSparse::sort_crs_matrix( + A.graph.row_map, A.graph.entries, A.values); + } + } else { + if (doStructInterface) { + KokkosSparse::sort_crs_graph(A.graph); + } else { + KokkosSparse::sort_crs_graph( + A.graph.row_map, A.graph.entries); + } + } + // Copy to host and compare + Kokkos::View entriesOut("sorted entries host", + nnz); + Kokkos::View valuesOut("sorted values host", + nnz); + Kokkos::deep_copy(entriesOut, entries); + Kokkos::deep_copy(valuesOut, values); + for (size_type i = 0; i < nnz; i++) { + EXPECT_EQ(entriesHost(i), entriesOut(i)) + << "Sorted column indices are wrong!"; + if (doValues) { + EXPECT_EQ(valuesHost(i), valuesOut(i)) << "Sorted values are wrong!"; + } + } +} + +template +void testSortCRSUnmanaged(bool doValues, bool doStructInterface) { + // This test is about bug #960. 
+ using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using mem_space = typename exec_space::memory_space; + using device_t = Kokkos::Device; + using crsMat_t = + KokkosSparse::CrsMatrix, + size_type>; + using crsMat_Managed_t = + KokkosSparse::CrsMatrix; + using rowmap_t = typename crsMat_t::row_map_type; + using entries_t = typename crsMat_t::index_type; + using values_t = typename crsMat_t::values_type; + const lno_t numRows = 50; + const lno_t numCols = numRows; + size_type nnz = numRows * 5; + // Create a random matrix on device + // IMPORTANT: kk_generate_sparse_matrix does not sort the rows, if it did this + // wouldn't test anything + crsMat_Managed_t A_managed = + KokkosSparse::Impl::kk_generate_sparse_matrix( + numRows, numCols, nnz, 2, numCols / 2); + crsMat_t A(A_managed); + auto rowmap = A.graph.row_map; + auto entries = A.graph.entries; + auto values = A.values; + if (doValues) { + if (doStructInterface) { + KokkosSparse::sort_crs_matrix(A); + } else { + KokkosSparse::sort_crs_matrix( + A.graph.row_map, A.graph.entries, A.values); + } + } else { + if (doStructInterface) { + KokkosSparse::sort_crs_graph(A.graph); + } else { + KokkosSparse::sort_crs_graph( + A.graph.row_map, A.graph.entries); + } + } +} + +template +void testSortAndMerge() { + using size_type = default_size_type; + using lno_t = default_lno_t; + using scalar_t = default_scalar; + using mem_space = typename exec_space::memory_space; + using device_t = Kokkos::Device; + using crsMat_t = + KokkosSparse::CrsMatrix; + using rowmap_t = typename crsMat_t::row_map_type::non_const_type; + using entries_t = typename crsMat_t::index_type; + using values_t = typename crsMat_t::values_type; + using Kokkos::HostSpace; + using Kokkos::MemoryTraits; + using Kokkos::Unmanaged; + // Create a small CRS matrix on host + std::vector inRowmap = {0, 4, 4, 5, 7, 10}; + std::vector inEntries = { + 4, 3, 5, 3, // row 0 + // row 1 has no entries + 6, // row 2 + 
2, 2, // row 3 + 0, 1, 2 // row 4 + }; + // note: choosing values that can be represented exactly by float + std::vector inValues = { + 1.5, 4, 1, -3, // row 0 + // row 1 + 2, // row 2 + -1, -2, // row 3 + 0, 3.5, -2.25 // row 4 + }; + lno_t nrows = 5; + lno_t ncols = 7; + size_type nnz = inEntries.size(); + Kokkos::View> hostInRowmap( + inRowmap.data(), nrows + 1); + Kokkos::View> hostInEntries( + inEntries.data(), nnz); + Kokkos::View> hostInValues( + inValues.data(), nnz); + rowmap_t devInRowmap("", nrows + 1); + entries_t devInEntries("", nnz); + values_t devInValues("", nnz); + Kokkos::deep_copy(devInRowmap, hostInRowmap); + Kokkos::deep_copy(devInEntries, hostInEntries); + Kokkos::deep_copy(devInValues, hostInValues); + crsMat_t input("Input", nrows, ncols, nnz, devInValues, devInRowmap, + devInEntries); + crsMat_t output = KokkosSparse::sort_and_merge_matrix(input); + exec_space().fence(); + EXPECT_EQ(output.numRows(), nrows); + EXPECT_EQ(output.numCols(), ncols); + auto outRowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + output.graph.row_map); + auto outEntries = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), + output.graph.entries); + auto outValues = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), output.values); + // Expect 2 merges to have taken place + std::vector goldRowmap = {0, 3, 3, 4, 5, 8}; + std::vector goldEntries = { + 3, 4, 5, // row 0 + // row 1 has no entries + 6, // row 2 + 2, // row 3 + 0, 1, 2 // row 4 + }; + // note: choosing values that can be represented exactly by float + std::vector goldValues = { + 1, 1.5, 1, // row 0 + // row 1 + 2, // row 2 + -3, // row 3 + 0, 3.5, -2.25 // row 4 + }; + EXPECT_EQ(goldRowmap.size(), outRowmap.extent(0)); + EXPECT_EQ(goldEntries.size(), outEntries.extent(0)); + EXPECT_EQ(goldValues.size(), outValues.extent(0)); + EXPECT_EQ(goldValues.size(), output.nnz()); + for (lno_t i = 0; i < nrows + 1; i++) EXPECT_EQ(goldRowmap[i], outRowmap(i)); + for (size_type i = 0; 
i < output.nnz(); i++) { + EXPECT_EQ(goldEntries[i], outEntries(i)); + EXPECT_EQ(goldValues[i], outValues(i)); + } +} + +TEST_F(TestCategory, common_sort_crsgraph) { + for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { + testSortCRS(10, 10, 20, false, doStructInterface); + testSortCRS(100, 100, 2000, false, doStructInterface); + testSortCRS(1000, 1000, 30000, false, doStructInterface); + testSortCRSUnmanaged(false, doStructInterface); + } +} + +TEST_F(TestCategory, common_sort_crsmatrix) { + for (int doStructInterface = 0; doStructInterface < 2; doStructInterface++) { + testSortCRS(10, 10, 20, true, doStructInterface); + testSortCRS(100, 100, 2000, true, doStructInterface); + testSortCRS(1000, 1000, 30000, true, doStructInterface); + testSortCRSUnmanaged(true, doStructInterface); + } +} + +TEST_F(TestCategory, common_sort_crs_longrows) { + testSortCRS(1, 50000, 10000, false, false); + testSortCRS(1, 50000, 10000, true, false); +} + +TEST_F(TestCategory, common_sort_merge_crsmatrix) { + testSortAndMerge(); +} + +#endif // KOKKOSSPARSE_SORTCRSTEST_HPP diff --git a/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp b/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp new file mode 100644 index 0000000000..fc33f9f08b --- /dev/null +++ b/unit_test/sparse/Test_Sparse_TestUtils_RandCscMat.hpp @@ -0,0 +1,105 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include "KokkosKernels_TestUtils.hpp" + +namespace Test { +template +void doCscMat(size_t m, size_t n, ScalarType min_val, ScalarType max_val) { + auto expected_min = ScalarType(1.0); + int64_t expected_nnz = 0; + RandCscMat cm(m, n, min_val, max_val); + + for (int64_t i = 0; i < cm.get_nnz(); ++i) + ASSERT_GE(cm(i), expected_min) << cm.info; + + for (int64_t j = 0; j < cm.get_n(); ++j) { + for (int64_t i = 0; i < cm.get_col_len(j); ++i) + ASSERT_FLOAT_EQ(cm(cm.get_col_start(j) + i), cm(expected_nnz + i)) + << cm.info; + expected_nnz += cm.get_col_len(j); + } + ASSERT_EQ(cm.get_nnz(), expected_nnz) << cm.info; + + // No need to check data here. Kokkos unit-tests deep_copy. + auto vals = cm.get_vals(); + ASSERT_EQ(vals.extent(0), cm.get_nnz() + 1) << cm.info; + + auto row_ids = cm.get_row_ids(); + ASSERT_EQ(row_ids.extent(0), cm.get_n() * cm.get_m() + 1) << cm.info; + + auto col_map = cm.get_col_map(); + ASSERT_EQ(col_map.extent(0), cm.get_n() + 1); +} + +template +void doAllCscMat(size_t m, size_t n) { + int min = 1, max = 10; + + // Verify that CscMax is constructed properly. + doCscMat(m, n, min, max); + doCscMat(m, n, min, max); + + doCscMat(m, n, min, max); + doCscMat(m, n, min, max); + + // Verify that CscMax can be instantiated with complex types. 
+ RandCscMat, Kokkos::LayoutLeft, ExeSpaceType> cmcf( + m, n, min, max); + RandCscMat, Kokkos::LayoutRight, ExeSpaceType> cmcd( + m, n, min, max); +} + +// Test randomly generated csc matrices +TEST_F(TestCategory, sparse_randcscmat) { + // Square cases + for (int dim = 1; dim < 1024; dim *= 4) doAllCscMat(dim, dim); + + // Non-square cases + for (int dim = 1; dim < 1024; dim *= 4) { + doAllCscMat(dim * 3, dim); + doAllCscMat(dim, dim * 3); + } +} +} // namespace Test \ No newline at end of file diff --git a/unit_test/sparse/Test_Sparse_Transpose.hpp b/unit_test/sparse/Test_Sparse_Transpose.hpp new file mode 100644 index 0000000000..77868a7251 --- /dev/null +++ b/unit_test/sparse/Test_Sparse_Transpose.hpp @@ -0,0 +1,357 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +/// \file Test_Common_Transpose.hpp + +#ifndef KOKKOSKERNELS_TRANSPOSE_HPP +#define KOKKOSKERNELS_TRANSPOSE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +template +struct ExactCompare { + ExactCompare(const V& v1_, const V& v2_) : v1(v1_), v2(v2_) {} + + KOKKOS_INLINE_FUNCTION void operator()(size_type i, size_type& ldiffs) const { + if (v1(i) != v2(i)) ldiffs++; + } + + V v1; + V v2; +}; + +template +void testTranspose(int numRows, int numCols, bool doValues) { + using range_pol = Kokkos::RangePolicy; + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using mem_space = typename exec_space::memory_space; + using device_t = Kokkos::Device; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using c_rowmap_t = typename crsMat_t::row_map_type; + using c_entries_t = typename crsMat_t::index_type; + using c_values_t = typename crsMat_t::values_type; + using rowmap_t = typename crsMat_t::row_map_type::non_const_type; + using entries_t = typename 
crsMat_t::index_type::non_const_type; + using values_t = typename crsMat_t::values_type::non_const_type; + size_type nnz = 10 * numRows; + // Generate a matrix that has 0 entries in some rows + crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( + numRows, numCols, nnz, 3 * 10, numRows / 2); + // compute the transpose while unsorted, then transpose again + rowmap_t t_rowmap("Rowmap^T", numCols + 1); // this view is initialized to 0 + entries_t t_entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T"), + input_mat.graph.entries.extent(0)); + values_t t_values(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"), + input_mat.values.extent(0)); + rowmap_t tt_rowmap("Rowmap^T^T", + numRows + 1); // this view is initialized to 0 + entries_t tt_entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T^T"), + input_mat.graph.entries.extent(0)); + values_t tt_values( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"), + input_mat.values.extent(0)); + if (doValues) { + KokkosSparse::Impl::transpose_matrix( + numRows, numCols, input_mat.graph.row_map, input_mat.graph.entries, + input_mat.values, t_rowmap, t_entries, t_values); + KokkosSparse::Impl::transpose_matrix( + numCols, numRows, t_rowmap, t_entries, t_values, tt_rowmap, tt_entries, + tt_values); + } else { + KokkosSparse::Impl::transpose_graph( + numRows, numCols, input_mat.graph.row_map, input_mat.graph.entries, + t_rowmap, t_entries); + KokkosSparse::Impl::transpose_graph( + numCols, numRows, t_rowmap, t_entries, tt_rowmap, tt_entries); + } + // Sort both the transpose-transpose, and the original matrix (to compare + // directly) + KokkosSparse::sort_crs_matrix(input_mat); + KokkosSparse::sort_crs_matrix( + tt_rowmap, tt_entries, tt_values); + // The views should now be exactly identical, since they represent the same + // matrix and are sorted + size_type rowmapDiffs; + Kokkos::parallel_reduce( + range_pol(0, numRows + 1), + 
ExactCompare(input_mat.graph.row_map, tt_rowmap), + rowmapDiffs); + size_type entriesDiffs; + Kokkos::parallel_reduce( + range_pol(0, input_mat.nnz()), + ExactCompare(input_mat.graph.entries, tt_entries), + entriesDiffs); + EXPECT_EQ(size_type(0), rowmapDiffs); + EXPECT_EQ(size_type(0), entriesDiffs); + if (doValues) { + size_type valuesDiffs; + Kokkos::parallel_reduce( + range_pol(0, input_mat.nnz()), + ExactCompare(input_mat.values, tt_values), + valuesDiffs); + EXPECT_EQ(size_type(0), valuesDiffs); + } +} + +template +void CompareBsrMatrices(bsrMat_t& A, bsrMat_t& B) { + using exec_space = typename bsrMat_t::execution_space; + using range_pol = Kokkos::RangePolicy; + using size_type = default_size_type; + using c_rowmap_t = typename bsrMat_t::row_map_type; + using c_entries_t = typename bsrMat_t::index_type; + using values_t = typename bsrMat_t::values_type::non_const_type; + + // The views should now be exactly identical, since they represent the same + // matrix and are sorted + + size_type rowmapDiffs; + Kokkos::parallel_reduce( + range_pol(0, A.numRows() + 1), + ExactCompare(A.graph.row_map, B.graph.row_map), + rowmapDiffs); + + size_type entriesDiffs; + Kokkos::parallel_reduce( + range_pol(0, A.nnz()), + ExactCompare(A.graph.entries, B.graph.entries), + entriesDiffs); + + EXPECT_EQ(size_type(0), rowmapDiffs); + EXPECT_EQ(size_type(0), entriesDiffs); + + size_type valuesDiffs; + Kokkos::parallel_reduce(range_pol(0, A.nnz() * A.blockDim() * A.blockDim()), + ExactCompare(A.values, B.values), + valuesDiffs); + EXPECT_EQ(size_type(0), valuesDiffs); +} + +template +void testTransposeBsrRef() { + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using mem_space = typename exec_space::memory_space; + using device_t = Kokkos::Device; + using bsrMat_t = + typename KokkosSparse::Experimental::BsrMatrix; + using rowmap_t = typename bsrMat_t::row_map_type::non_const_type; + using entries_t = typename 
bsrMat_t::index_type::non_const_type; + using values_t = typename bsrMat_t::values_type::non_const_type; + + const int numRows = 4; + const int nnz = 7; + const int block_size = 2; + + // Coming up with a BsrMatrix + bsrMat_t A; + { + rowmap_t row_map("row map", numRows + 1); + entries_t entries("entries", nnz); + values_t values("values", nnz * block_size * block_size); + + const size_type row_mapPtr[] = {0, 2, 3, 5, 7}; + const lno_t entriesPtr[] = {2, 3, 1, 0, 1, 1, 3}; + const scalar_t valuesPtr[] = { + 0.0, 0.1, 0.2, 0.3, 1.0, 1.1, 1.2, 1.3, 2.0, 2.1, 2.2, 2.3, 3.0, 3.1, + 3.2, 3.3, 4.0, 4.1, 4.2, 4.3, 5.0, 5.1, 5.2, 5.3, 6.0, 6.1, 6.2, 6.3}; + + typename rowmap_t::HostMirror::const_type row_map_h(row_mapPtr, + numRows + 1); + typename entries_t::HostMirror::const_type entries_h(entriesPtr, nnz); + typename values_t::HostMirror::const_type values_h( + valuesPtr, nnz * block_size * block_size); + + Kokkos::deep_copy(row_map, row_map_h); + Kokkos::deep_copy(entries, entries_h); + Kokkos::deep_copy(values, values_h); + + A = bsrMat_t("A", numRows, numRows, nnz, values, row_map, entries, + block_size); + } + + // Constructing the transpose of A manually + bsrMat_t At_ref; + { + rowmap_t row_map("row map", numRows + 1); + entries_t entries("entries", nnz); + values_t values("values", nnz * block_size * block_size); + + const size_type row_mapPtr[] = {0, 1, 4, 5, 7}; + const lno_t entriesPtr[] = {2, 1, 2, 3, 0, 0, 3}; + const scalar_t valuesPtr[] = { + 3.0, 3.2, 3.1, 3.3, 2.0, 2.2, 2.1, 2.3, 4.0, 4.2, 4.1, 4.3, 5.0, 5.2, + 5.1, 5.3, 0.0, 0.2, 0.1, 0.3, 1.0, 1.2, 1.1, 1.3, 6.0, 6.2, 6.1, 6.3}; + + typename rowmap_t::HostMirror::const_type row_map_h(row_mapPtr, + numRows + 1); + typename entries_t::HostMirror::const_type entries_h(entriesPtr, nnz); + typename values_t::HostMirror::const_type values_h( + valuesPtr, nnz * block_size * block_size); + + Kokkos::deep_copy(row_map, row_map_h); + Kokkos::deep_copy(entries, entries_h); + Kokkos::deep_copy(values, values_h); + 
+ At_ref = bsrMat_t("A", numRows, numRows, nnz, values, row_map, entries, + block_size); + } + + bsrMat_t At = KokkosSparse::Impl::transpose_bsr_matrix(A); + KokkosSparse::sort_bsr_matrix(At); + + CompareBsrMatrices(At, At_ref); +} + +template +void testTransposeBsr(int numRows, int numCols, int blockSize) { + using scalar_t = default_scalar; + using lno_t = default_lno_t; + using size_type = default_size_type; + using mem_space = typename exec_space::memory_space; + using device_t = Kokkos::Device; + using bsrMat_t = + typename KokkosSparse::Experimental::BsrMatrix; + using c_rowmap_t = typename bsrMat_t::row_map_type; + using c_entries_t = typename bsrMat_t::index_type; + using c_values_t = typename bsrMat_t::values_type; + using rowmap_t = typename bsrMat_t::row_map_type::non_const_type; + using entries_t = typename bsrMat_t::index_type::non_const_type; + using values_t = typename bsrMat_t::values_type::non_const_type; + + // Generate a matrix that has 0 entries in some rows + size_type nnz = 10 * numRows; + bsrMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix( + blockSize, numRows, numCols, nnz, 3, numRows / 4); + + // compute the transpose while unsorted, then transpose again + rowmap_t t_rowmap("Rowmap^T", numCols + 1); // this view is initialized to 0 + entries_t t_entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T"), + A.graph.entries.extent(0)); + values_t t_values(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"), + A.values.extent(0)); + rowmap_t tt_rowmap("Rowmap^T^T", + numRows + 1); // this view is initialized to 0 + entries_t tt_entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries^T^T"), + A.graph.entries.extent(0)); + values_t tt_values( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "Values^T"), + A.values.extent(0)); + + KokkosSparse::Impl::transpose_bsr_matrix( + numRows, numCols, blockSize, A.graph.row_map, A.graph.entries, A.values, + t_rowmap, t_entries, t_values); + + 
KokkosSparse::Impl::transpose_bsr_matrix< + rowmap_t, entries_t, values_t, rowmap_t, entries_t, values_t, exec_space>( + numCols, numRows, blockSize, t_rowmap, t_entries, t_values, tt_rowmap, + tt_entries, tt_values); + bsrMat_t Att("Att", numRows, numCols, nnz, tt_values, tt_rowmap, tt_entries, + blockSize); + + // Sort both the transpose-transpose, and the original matrix (to compare + // directly) + KokkosSparse::sort_bsr_matrix(A); + + KokkosSparse::sort_bsr_matrix(Att); + + CompareBsrMatrices(A, Att); +} + +TEST_F(TestCategory, sparse_transpose_matrix) { + // Test both matrix and graph transpose with various sizes + testTranspose(100, 100, true); + testTranspose(500, 50, true); + testTranspose(50, 500, true); + testTranspose(4000, 2000, true); + testTranspose(2000, 4000, true); + testTranspose(2000, 2000, true); +} + +TEST_F(TestCategory, sparse_transpose_graph) { + testTranspose(100, 100, false); + testTranspose(500, 50, false); + testTranspose(50, 500, false); + testTranspose(4000, 2000, false); + testTranspose(2000, 4000, false); + testTranspose(2000, 2000, false); +} + +TEST_F(TestCategory, sparse_transpose_bsr_matrix) { + testTransposeBsrRef(); + // Test bsrMatrix transpose with various sizes + testTransposeBsr(100, 100, 3); + testTransposeBsr(500, 50, 5); + testTransposeBsr(50, 500, 16); + testTransposeBsr(4000, 2000, 3); + testTransposeBsr(2000, 4000, 3); + testTransposeBsr(2000, 2000, 5); +} + +#endif diff --git a/unit_test/sparse/Test_Sparse_Utils_cusparse.hpp b/unit_test/sparse/Test_Sparse_Utils_cusparse.hpp index 3d85ec394a..0ad16c54d0 100644 --- a/unit_test/sparse/Test_Sparse_Utils_cusparse.hpp +++ b/unit_test/sparse/Test_Sparse_Utils_cusparse.hpp @@ -7,7 +7,7 @@ #include #include -#include "KokkosKernels_SparseUtils_cusparse.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" void test_cusparse_safe_call() { bool caught_exception = false; diff --git a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp 
b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp index d76f6be812..51e0899529 100644 --- a/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp +++ b/unit_test/sparse/Test_Sparse_block_gauss_seidel.hpp @@ -48,7 +48,8 @@ #include "KokkosKernels_TestUtils.hpp" #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include #include #include @@ -58,19 +59,10 @@ #include #include "KokkosSparse_gauss_seidel.hpp" -// #ifndef kokkos_complex_double -// #define kokkos_complex_double Kokkos::complex -// #define kokkos_complex_float Kokkos::complex -// #endif +using kokkos_complex_double = Kokkos::complex; +using kokkos_complex_float = Kokkos::complex; -typedef Kokkos::complex kokkos_complex_double; -typedef Kokkos::complex kokkos_complex_float; - -using namespace KokkosKernels; -using namespace KokkosKernels::Impl; -using namespace KokkosKernels::Experimental; -using namespace KokkosSparse; -using namespace KokkosSparse::Experimental; +namespace KSExp = KokkosSparse::Experimental; namespace Test { @@ -90,8 +82,9 @@ struct GSTestParams { // Note: GS_DEFAULT is same as GS_TEAM and - for blocks - as GS_PERMUTED // Note: GS_TWOSTAGE and GS_CLUSTER are not supported for blocks - std::vector gs_algorithms = {GS_DEFAULT}; - std::vector shmem_sizes = { + std::vector gs_algorithms = { + KokkosSparse::GS_DEFAULT}; + std::vector shmem_sizes = { 32128, 2008 // make the shmem small on gpus so that it will test 2 level // algorithm. 
@@ -119,12 +112,11 @@ int run_block_gauss_seidel_1( typedef typename lno_nnz_view_t::value_type lno_t; typedef typename scalar_view_t::value_type scalar_t; - constexpr auto format = MatrixTraits::format; + constexpr auto format = KokkosSparse::Impl::MatrixTraits::format; - typedef KokkosKernelsHandle< + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, typename mtx_t::execution_space, - typename mtx_t::memory_space, typename mtx_t::memory_space> - KernelHandle; + typename mtx_t::memory_space, typename mtx_t::memory_space>; KernelHandle kh; kh.set_team_work_size(16); kh.set_shmem_size(shmem_size); @@ -136,33 +128,33 @@ int run_block_gauss_seidel_1( const int apply_count = 100; if (!skip_symbolic) { - block_gauss_seidel_symbolic(&kh, num_rows_1, num_cols_1, block_size, - input_mat.graph.row_map, - input_mat.graph.entries, is_symmetric_graph); + KSExp::block_gauss_seidel_symbolic( + &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map, + input_mat.graph.entries, is_symmetric_graph); } if (!skip_numeric) { - block_gauss_seidel_numeric( + KSExp::block_gauss_seidel_numeric( &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, is_symmetric_graph); } switch (apply_type) { case Test::forward_sweep: - forward_sweep_block_gauss_seidel_apply( + KSExp::forward_sweep_block_gauss_seidel_apply( &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); break; case Test::backward_sweep: - backward_sweep_block_gauss_seidel_apply( + KSExp::backward_sweep_block_gauss_seidel_apply( &kh, num_rows_1, num_cols_1, block_size, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); break; case Test::symmetric: default: - symmetric_block_gauss_seidel_apply( + KSExp::symmetric_block_gauss_seidel_apply( &kh, 
num_rows_1, num_cols_1, block_size, input_mat.graph.row_map, input_mat.graph.entries, input_mat.values, x_vector, y_vector, false, true, omega, apply_count); @@ -175,15 +167,15 @@ int run_block_gauss_seidel_1( } // namespace Test -template +template void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using namespace Test; srand(245); - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using MatrixConverter = KokkosSparse::Impl::MatrixConverter; typedef typename device::execution_space exec_space; typedef typename crsMat_t::StaticCrsGraphType graph_t; @@ -200,7 +192,7 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t block_size = params.block_size; crsMat_t crsmat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); lno_view_t pf_rm; @@ -211,16 +203,15 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, // this makes consecutive 5 rows to have same columns. // it will add scalar 0's for those entries that does not exists. // the result is still a point crs matrix. - KokkosKernels::Impl::kk_create_blockcrs_formated_point_crsmatrix( + KokkosSparse::Impl::kk_create_blockcrs_formatted_point_crsmatrix( block_size, crsmat.numRows(), crsmat.numCols(), crsmat.graph.row_map, crsmat.graph.entries, crsmat.values, out_r, out_c, pf_rm, pf_e, pf_v); graph_t static_graph2(pf_e, pf_rm); crsMat_t crsmat2("CrsMatrix2", out_c, pf_v, static_graph2); // this converts the previous generated matrix to block matrix. 
- auto input_mat = - MatrixConverter::from_blockcrs_formated_point_crsmatrix( - crsmat2, block_size); + auto input_mat = MatrixConverter::from_blockcrs_formatted_point_crsmatrix( + crsmat2, block_size); lno_t nv = ((crsmat2.numRows() + block_size - 1) / block_size) * block_size; @@ -262,15 +253,15 @@ void test_block_gauss_seidel_rank1(lno_t numRows, size_type nnz, // device::execution_space::finalize(); } -template +template void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance) { using namespace Test; srand(245); - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using MatrixConverter = KokkosSparse::Impl::MatrixConverter; typedef typename device::execution_space exec_space; typedef typename crsMat_t::StaticCrsGraphType graph_t; @@ -288,7 +279,7 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t block_size = params.block_size; crsMat_t crsmat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); lno_view_t pf_rm; @@ -299,15 +290,14 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, // this makes consecutive 5 rows to have same columns. // it will add scalar 0's for those entries that does not exists. // the result is still a point crs matrix. 
- KokkosKernels::Impl::kk_create_blockcrs_formated_point_crsmatrix( + KokkosSparse::Impl::kk_create_blockcrs_formatted_point_crsmatrix( block_size, crsmat.numRows(), crsmat.numCols(), crsmat.graph.row_map, crsmat.graph.entries, crsmat.values, out_r, out_c, pf_rm, pf_e, pf_v); graph_t static_graph2(pf_e, pf_rm); crsMat_t crsmat2("CrsMatrix2", out_c, pf_v, static_graph2); - auto input_mat = - MatrixConverter::from_blockcrs_formated_point_crsmatrix( - crsmat2, block_size); + auto input_mat = MatrixConverter::from_blockcrs_formatted_point_crsmatrix( + crsmat2, block_size); lno_t nv = ((crsmat2.numRows() + block_size - 1) / block_size) * block_size; @@ -372,8 +362,8 @@ void test_block_gauss_seidel_rank2(lno_t numRows, size_type nnz, // device::execution_space::finalize(); } -template +template void test_block_gauss_seidel_empty() { using namespace Test; typedef @@ -383,10 +373,9 @@ void test_block_gauss_seidel_empty() { typedef typename graph_t::row_map_type::non_const_type row_map_type; typedef typename graph_t::entries_type::non_const_type entries_type; typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef KokkosKernelsHandle< + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< size_type, lno_t, scalar_t, typename device::execution_space, - typename device::memory_space, typename device::memory_space> - KernelHandle; + typename device::memory_space, typename device::memory_space>; // The rowmap of a zero-row matrix can be length 0 or 1, so Gauss-Seidel // should work with both (the setup and apply are essentially no-ops but they // shouldn't crash or throw exceptions) For this test, create size-0 and @@ -394,7 +383,7 @@ void test_block_gauss_seidel_empty() { // which can trigger different bugs. 
for (const int rowmapLen : {0, 1, 5}) { KernelHandle kh; - kh.create_gs_handle(GS_DEFAULT); + kh.create_gs_handle(KokkosSparse::GS_DEFAULT); const auto num_rows = KOKKOSKERNELS_MACRO_MAX(0, rowmapLen - 1); const lno_t block_size = 1; // irrelevant (no values here) // initialized to 0 @@ -402,183 +391,58 @@ void test_block_gauss_seidel_empty() { entries_type entries("Entries", 0); scalar_view_t values("Values", 0); // also, make sure graph symmetrization doesn't crash on zero rows - block_gauss_seidel_symbolic(&kh, num_rows, num_rows, block_size, rowmap, - entries, false); - block_gauss_seidel_numeric(&kh, num_rows, num_rows, block_size, - rowmap, entries, values, false); + KSExp::block_gauss_seidel_symbolic(&kh, num_rows, num_rows, block_size, + rowmap, entries, false); + KSExp::block_gauss_seidel_numeric( + &kh, num_rows, num_rows, block_size, rowmap, entries, values, false); scalar_view_t x("X", num_rows); scalar_view_t y("Y", num_rows); scalar_t omega(0.9); - symmetric_block_gauss_seidel_apply( + KSExp::symmetric_block_gauss_seidel_apply( &kh, num_rows, num_rows, block_size, rowmap, entries, values, x, y, false, true, omega, 3); kh.destroy_gs_handle(); } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse_blockcrs_gauss_seidel_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_rank1( \ - 500, 500 * 10, 70, 3); \ + test_block_gauss_seidel_rank1(500, 500 * 10, 70, 3); \ } \ TEST_F( \ TestCategory, \ sparse_blockcrs_gauss_seidel_rank2_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_rank2( \ - 500, 500 * 10, 70, 3); \ + test_block_gauss_seidel_rank2(500, 500 * 10, 70, 3); \ } \ TEST_F( \ TestCategory, \ sparse_blockcrs_gauss_seidel_empty_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_empty(); \ + test_block_gauss_seidel_empty(); \ } \ TEST_F( \ TestCategory, \ 
sparse_bsr_gauss_seidel_rank1_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_rank1( \ - 500, 500 * 10, 70, 3); \ + test_block_gauss_seidel_rank1(500, 500 * 10, 70, 3); \ } \ TEST_F( \ TestCategory, \ sparse_bsr_gauss_seidel_rank2_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_rank2( \ - 500, 500 * 10, 70, 3); \ + test_block_gauss_seidel_rank2(500, 500 * 10, 70, 3); \ } \ TEST_F( \ TestCategory, \ sparse_bsr_gauss_seidel_empty_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_block_gauss_seidel_empty(); \ + test_block_gauss_seidel_empty(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if 
(defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#undef EXECUTE_TEST +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_bspgemm.hpp b/unit_test/sparse/Test_Sparse_bspgemm.hpp new file mode 100644 index 0000000000..7374ac6a78 --- /dev/null +++ b/unit_test/sparse/Test_Sparse_bspgemm.hpp @@ -0,0 +1,318 @@ +/* +//@HEADER +// ************************************************************************ 
+// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +#include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_SortCrs.hpp" +#include "KokkosSparse_spgemm.hpp" +#include "KokkosSparse_BsrMatrix.hpp" +#include "KokkosSparse_IOUtils.hpp" + +using namespace KokkosSparse; + +namespace Test { + +template +int run_block_spgemm(const bsrMat_t A, const bsrMat_t B, bsrMat_t &C, + // parameters + KokkosSparse::SPGEMMAlgorithm spgemm_algorithm, + bool use_dynamic_scheduling = true, + size_t shmem_size = 0) { + typedef typename bsrMat_t::size_type size_type; + typedef typename bsrMat_t::ordinal_type lno_t; + typedef typename bsrMat_t::value_type scalar_t; + typedef typename bsrMat_t::device_type device; + typedef typename bsrMat_t::memory_space memory_space; + + typedef KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, typename device::execution_space, + memory_space, memory_space> + KernelHandle; + + KernelHandle kh; + kh.set_team_work_size(16); + kh.set_dynamic_scheduling(use_dynamic_scheduling); + + kh.create_spgemm_handle(spgemm_algorithm); + + if (shmem_size > 0) { + kh.set_shmem_size(shmem_size); + } + KokkosSparse::block_spgemm_symbolic(kh, A, false, B, false, C); + KokkosSparse::block_spgemm_numeric(kh, A, false, B, false, C); + kh.destroy_spgemm_handle(); + + return 0; +} + +template +bool is_same_block_matrix(bsrMat_t output_mat_actual, + bsrMat_t output_mat_reference) { + using device = typename bsrMat_t::device_type; + using graph_t = typename bsrMat_t::StaticCrsGraphType; + using lno_view_t = typename graph_t::row_map_type::non_const_type; + using lno_nnz_view_t = typename graph_t::entries_type::non_const_type; + using scalar_view_t = typename bsrMat_t::values_type::non_const_type; + + size_t nrows_actual = output_mat_actual.numRows(); + size_t nentries_actual = output_mat_actual.graph.entries.extent(0); + size_t 
nvals_actual = output_mat_actual.values.extent(0); + + size_t nrows_reference = output_mat_reference.numRows(); + size_t nentries_reference = output_mat_reference.graph.entries.extent(0); + size_t nvals_reference = output_mat_reference.values.extent(0); + + if (nrows_actual != nrows_reference) { + std::cout << "nrows_actual:" << nrows_actual + << " nrows_reference:" << nrows_reference << std::endl; + return false; + } + if (nentries_actual != nentries_reference) { + std::cout << "nentries_actual:" << nentries_actual + << " nentries_reference:" << nentries_reference << std::endl; + return false; + } + if (nvals_actual != nvals_reference) { + std::cout << "nvals_actual:" << nvals_actual + << " nvals_reference:" << nvals_reference << std::endl; + return false; + } + + KokkosSparse::sort_bsr_matrix(output_mat_actual); + KokkosSparse::sort_bsr_matrix(output_mat_reference); + + bool is_identical = true; + is_identical = KokkosKernels::Impl::kk_is_identical_view< + typename graph_t::row_map_type, typename graph_t::row_map_type, + typename lno_view_t::value_type, typename device::execution_space>( + output_mat_actual.graph.row_map, output_mat_reference.graph.row_map, 0); + + if (!is_identical) { + std::cout << "rowmaps are different." << std::endl; + std::cout << "Actual rowmap:\n"; + KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.graph.row_map); + std::cout << "Correct rowmap (SPGEMM_DEBUG):\n"; + KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.graph.row_map); + return false; + } + + is_identical = KokkosKernels::Impl::kk_is_identical_view< + lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type, + typename device::execution_space>(output_mat_actual.graph.entries, + output_mat_reference.graph.entries, 0); + + if (!is_identical) { + std::cout << "entries are different." 
<< std::endl; + KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.graph.entries); + KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.graph.entries); + return false; + } + + typedef typename Kokkos::Details::ArithTraits< + typename scalar_view_t::non_const_value_type>::mag_type eps_type; + eps_type eps = std::is_same::value ? 3.7e-3 : 1e-7; + + is_identical = KokkosKernels::Impl::kk_is_relatively_identical_view< + scalar_view_t, scalar_view_t, eps_type, typename device::execution_space>( + output_mat_actual.values, output_mat_reference.values, eps); + + if (!is_identical) { + std::cout << "values are different." << std::endl; + KokkosKernels::Impl::kk_print_1Dview(output_mat_actual.values); + KokkosKernels::Impl::kk_print_1Dview(output_mat_reference.values); + + return false; + } + return true; +} +} // namespace Test + +// Generate matrices and test all supported spgemm algorithms. +// C := AB, where A is m*k, B is k*n, and C is m*n. +template +void test_bspgemm(lno_t blkDim, lno_t m, lno_t k, lno_t n, size_type nnz, + lno_t bandwidth, lno_t row_size_variance, + const bool use_dynamic_scheduling = true, + const size_t shared_memory_size = 0) { + using namespace Test; + // device::execution_space::initialize(); + // device::execution_space::print_configuration(std::cout); + + using bsrMat_t = + KokkosSparse::Experimental::BsrMatrix; + + // Generate random compressed sparse row matrix. Randomly generated (non-zero) + // values are stored in a 1-D (1 rank) array. 
+ bsrMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix( + blkDim, m, k, nnz, row_size_variance, bandwidth); + bsrMat_t B = KokkosSparse::Impl::kk_generate_sparse_matrix( + blkDim, k, n, nnz, row_size_variance, bandwidth); + + const bool is_empy_case = m < 1 || n < 1 || k < 1 || nnz < 1; + + bsrMat_t output_mat2; + run_block_spgemm(A, B, output_mat2, SPGEMM_DEBUG, use_dynamic_scheduling, + shared_memory_size); + + std::vector algorithms = { + SPGEMM_KK, + SPGEMM_KK_MEMORY /* alias SPGEMM_KK_MEMSPEED */, + SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */, + SPGEMM_MKL /* verify failure in case of missing build */, + }; + + if (!KokkosKernels::Impl::kk_is_gpu_exec_space< + typename device::execution_space>()) { + // SPGEMM_KK_LP is useful on CPU to cover MultiCoreTag4 functor + // (otherwise skipped) but on GPU it's same as SPGEMM_KK, so we can skip it. + algorithms.push_back(SPGEMM_KK_LP); + } + + for (auto spgemm_algorithm : algorithms) { + const uint64_t max_integer = Kokkos::ArithTraits::max(); + std::string algo = "UNKNOWN"; + bool is_expected_to_fail = false; + + switch (spgemm_algorithm) { + case SPGEMM_CUSPARSE: + // TODO: add these test failure cases for cusparse too. + algo = "SPGEMM_CUSPARSE"; +#ifndef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + is_expected_to_fail = true; +#endif + break; + + case SPGEMM_MKL: + algo = "SPGEMM_MKL"; + is_expected_to_fail = !is_empy_case; // TODO: add block MKL impl +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if (!KokkosSparse::Impl::mkl_is_supported_value_type::value) { + is_expected_to_fail = true; + } +#else + is_expected_to_fail = true; // fail: MKL not enabled in build +#endif + // MKL requires local ordinals to be int. + // Note: empty-array special case will NOT fail on this. + if (!std::is_same::value && !is_empy_case) { + is_expected_to_fail = true; + } + // if size_type is larger than int, mkl casts it to int. + // it will fail if casting cause overflow. 
+ if (A.values.extent(0) > max_integer) { + is_expected_to_fail = true; + } + break; + + case SPGEMM_KK: algo = "SPGEMM_KK"; break; + case SPGEMM_KK_LP: algo = "SPGEMM_KK_LP"; break; + case SPGEMM_KK_MEMSPEED: algo = "SPGEMM_KK_MEMSPEED"; break; + case SPGEMM_KK_SPEED: algo = "SPGEMM_KK_SPEED"; break; + case SPGEMM_KK_MEMORY: algo = "SPGEMM_KK_MEMORY"; break; + default: algo = "!!! UNKNOWN ALGO !!!"; + } + + Kokkos::Timer timer1; + bsrMat_t output_mat; + + bool failed = false; + int res = 0; + try { + res = run_block_spgemm(A, B, output_mat, spgemm_algorithm, + use_dynamic_scheduling, shared_memory_size); + } catch (const char *message) { + EXPECT_TRUE(is_expected_to_fail) << algo << ": " << message; + failed = true; + } catch (std::string message) { + EXPECT_TRUE(is_expected_to_fail) << algo << ": " << message; + failed = true; + } catch (std::exception &e) { + EXPECT_TRUE(is_expected_to_fail) << algo << ": " << e.what(); + failed = true; + } + EXPECT_EQ(is_expected_to_fail, failed); + + // double spgemm_time = timer1.seconds(); + + timer1.reset(); + if (!is_expected_to_fail) { + EXPECT_TRUE((res == 0)) << algo; + bool is_identical = is_same_block_matrix(output_mat, output_mat2); + EXPECT_TRUE(is_identical) << algo; + // EXPECT_TRUE( equal) << algo; + } + // std::cout << "algo:" << algo << " spgemm_time:" << spgemm_time << " + // output_check_time:" << timer1.seconds() << std::endl; + } + // device::execution_space::finalize(); +} + +// Note: Tests with shared memory specified aim to trigger specific GPU functors +// dispatched by matrix size and the available shared memory. 
+#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F(TestCategory, \ + sparse_block_spgemm_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + auto const SHMEM_AUTO = 0; \ + auto test_case = test_bspgemm; \ + /* Trigger SPGEMM_KK_MEMORY_SPREADTEAM on GPU */ \ + test_case(2, 50, 50, 50, 2000, 50, 5, true, 16 * 1024); \ + /* Trigger SPGEMM_KK -> SPGEMM_KK_MEMORY on GPU */ \ + test_case(2, 50, 50, 50, 1000, 50, 5, false, 16 * 1024); \ + /* Trigger SPGEMM_KK_MEMORY_BIGSPREADTEAM on GPU */ \ + test_case(2, 500, 500, 500, 32000, 500, 500, true, 16 * 1024); \ + /* trigger dense dispatch in hash method */ \ + test_case(2, 2, 3, 4, 2, 2, 0, true, 16 * 1024); \ + /* zero-size handling */ \ + test_case(2, 0, 0, 0, 0, 10, 10, true, SHMEM_AUTO); \ + test_case(2, 0, 12, 5, 0, 10, 0, true, SHMEM_AUTO); \ + test_case(2, 10, 10, 0, 0, 10, 10, true, SHMEM_AUTO); \ + } + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_csc2csr.hpp b/unit_test/sparse/Test_Sparse_csc2csr.hpp new file mode 100644 index 0000000000..e7d2ad868e --- /dev/null +++ b/unit_test/sparse/Test_Sparse_csc2csr.hpp @@ -0,0 +1,164 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? 
Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include "KokkosSparse_csc2csr.hpp" +#include "KokkosKernels_TestUtils.hpp" + +namespace Test { +template +void doCsc2Csr(size_t m, size_t n, ScalarType min_val, ScalarType max_val, + bool fully_sparse = false) { + RandCscMat cscMat( + m, n, min_val, max_val, fully_sparse); + constexpr int league_size = 32; + + auto csrMat = KokkosSparse::csc2csr( + cscMat.get_m(), cscMat.get_n(), cscMat.get_nnz(), cscMat.get_vals(), + cscMat.get_row_ids(), cscMat.get_col_map(), league_size); + + auto csc_row_ids_d = cscMat.get_row_ids(); + auto csc_col_map_d = cscMat.get_col_map(); + auto csc_vals_d = cscMat.get_vals(); + + using ViewTypeRowIds = decltype(csc_row_ids_d); + using ViewTypeColMap = decltype(csc_col_map_d); + using ViewTypeVals = decltype(csc_vals_d); + + // Copy to host + typename ViewTypeRowIds::HostMirror csc_row_ids = + Kokkos::create_mirror_view(csc_row_ids_d); + Kokkos::deep_copy(csc_row_ids, csc_row_ids_d); + typename ViewTypeColMap::HostMirror csc_col_map = + Kokkos::create_mirror_view(csc_col_map_d); + Kokkos::deep_copy(csc_col_map, csc_col_map_d); + typename ViewTypeVals::HostMirror csc_vals = + Kokkos::create_mirror_view(csc_vals_d); + Kokkos::deep_copy(csc_vals, csc_vals_d); + + auto csr_col_ids_d = csrMat.graph.entries; + auto csr_row_map_d = csrMat.graph.row_map; + auto csr_vals_d = csrMat.values; + + using ViewTypeCsrColIds = decltype(csr_col_ids_d); + using ViewTypeCsrRowMap = decltype(csr_row_map_d); + using ViewTypeCsrVals = decltype(csr_vals_d); + + // Copy to host + typename ViewTypeCsrColIds::HostMirror csr_col_ids = + Kokkos::create_mirror_view(csr_col_ids_d); + Kokkos::deep_copy(csr_col_ids, csr_col_ids_d); + typename ViewTypeCsrRowMap::HostMirror csr_row_map = + Kokkos::create_mirror_view(csr_row_map_d); + Kokkos::deep_copy(csr_row_map, csr_row_map_d); + typename ViewTypeCsrVals::HostMirror csr_vals = 
+ Kokkos::create_mirror_view(csr_vals_d); + Kokkos::deep_copy(csr_vals, csr_vals_d); + + Kokkos::fence(); + + for (int j = 0; j < cscMat.get_n(); ++j) { + auto col_start = csc_col_map(j); + auto col_len = csc_col_map(j + 1) - col_start; + + for (int k = 0; k < col_len; ++k) { + auto i = col_start + k; + + auto row_start = csr_row_map(csc_row_ids(i)); + auto row_len = csr_row_map(csc_row_ids(i) + 1) - row_start; + auto row_end = row_start + row_len; + + if (row_len == 0) continue; + + // Linear search for corresponding element in csr matrix + int l = row_start; + while (l < row_end && csr_col_ids(l) != j) { + ++l; + } + + if (l == row_end) + FAIL() << "csr element at (i: " << csc_row_ids(i) << ", j: " << j + << ") not found!" << std::endl; + + ASSERT_EQ(csc_vals(i), csr_vals(l)) + << "(i: " << csc_row_ids(i) << ", j: " << j << ")" << std::endl; + } + } +} + +template +void doAllScalarsCsc2Csr(size_t m, size_t n, int min, int max) { + doCsc2Csr(m, n, min, max); + doCsc2Csr(m, n, min, max); + doCsc2Csr, LayoutType, ExeSpaceType>(m, n, min, max); + doCsc2Csr, LayoutType, ExeSpaceType>(m, n, min, max); +} + +template +void doAllLayoutsCsc2Csr(size_t m, size_t n, int min, int max) { + doAllScalarsCsc2Csr(m, n, min, max); + doAllScalarsCsc2Csr(m, n, min, max); +} + +template +void doAllCsc2csr(size_t m, size_t n) { + int min = 1, max = 10; + doAllLayoutsCsc2Csr(m, n, min, max); +} + +TEST_F(TestCategory, sparse_csc2csr) { + // Square cases + for (size_t dim = 4; dim < 1024; dim *= 4) + doAllCsc2csr(dim, dim); + + // Non-square cases + for (size_t dim = 1; dim < 1024; dim *= 4) { + doAllCsc2csr(dim * 3, dim); + doAllCsc2csr(dim, dim * 3); + } + + // Fully sparse + doCsc2Csr(5, 5, 1, 10, true); + doCsc2Csr(50, 10, 10, 100, true); +} +} // namespace Test \ No newline at end of file diff --git a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp index f255fc4fcf..627a9fc99e 100644 --- a/unit_test/sparse/Test_Sparse_gauss_seidel.hpp 
+++ b/unit_test/sparse/Test_Sparse_gauss_seidel.hpp @@ -47,6 +47,7 @@ #include #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_IOUtils.hpp" //#include #include #include @@ -61,7 +62,7 @@ #include "KokkosSparse_gauss_seidel.hpp" #include "KokkosSparse_partitioning_impl.hpp" #include "KokkosSparse_sor_sequential_impl.hpp" -#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_SortCrs.hpp" #include "KokkosKernels_TestUtils.hpp" // #ifndef kokkos_complex_double @@ -183,7 +184,7 @@ void test_gauss_seidel_rank1(lno_t numRows, size_type nnz, lno_t bandwidth, srand(245); lno_t numCols = numRows; crsMat_t input_mat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); if (symmetric) { // Symmetrize on host, rather than relying on the parallel versions (those @@ -272,7 +273,7 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t numCols = numRows; crsMat_t input_mat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); if (symmetric) { // Symmetrize on host, rather than relying on the parallel versions (those @@ -396,7 +397,7 @@ void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, crsMat_t; lno_t numCols = numRows; crsMat_t input_mat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); auto rowmap = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), input_mat.graph.row_map); @@ -472,7 +473,7 @@ void test_balloon_clustering(lno_t numRows, size_type nnzPerRow, srand(245); size_type nnzTotal = nnzPerRow * numRows; lno_t 
nnzVariance = nnzPerRow / 4; - crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numRows, nnzTotal, nnzVariance, bandwidth); lno_row_view_t symRowmap; lno_nnz_view_t symEntries; @@ -609,7 +610,7 @@ void test_gauss_seidel_long_rows(lno_t numRows, lno_t numLongRows, rowmap.data(), numRows + 1)); crsMat_t input_mat("A", numRows, numRows, totalEntries, valuesView, rowmapView, entriesView); - input_mat = KokkosKernels::sort_and_merge_matrix(input_mat); + input_mat = KokkosSparse::sort_and_merge_matrix(input_mat); if (symmetric) { // Symmetrize on host, rather than relying on the parallel versions (those // can be tested for symmetric=false) @@ -660,11 +661,11 @@ void test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) { const scalar_t one = Kokkos::ArithTraits::one(); size_type nnz = nnzPerRow * numRows; crsMat_t input_mat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numRows, nnz, 0, numRows / 10, 2.0 * one); input_mat = Test::symmetrize(input_mat); - input_mat = KokkosKernels::sort_and_merge_matrix(input_mat); + input_mat = KokkosSparse::sort_and_merge_matrix(input_mat); scalar_view_t solution_x( Kokkos::view_alloc(Kokkos::WithoutInitializing, "X (correct)"), numRows); create_random_x_vector(solution_x); @@ -689,7 +690,7 @@ void test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) { EXPECT_LT(result_norm_res, 0.25 * initial_norm_res); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##gauss_seidel_asymmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -743,132 +744,6 @@ void test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) { 10); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - 
defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif +#include -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - 
-#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - 
defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_replaceSumInto.hpp b/unit_test/sparse/Test_Sparse_replaceSumInto.hpp index dc51be7f7b..4036e7ddbd 100644 --- a/unit_test/sparse/Test_Sparse_replaceSumInto.hpp +++ b/unit_test/sparse/Test_Sparse_replaceSumInto.hpp @@ -266,139 +266,13 @@ void test_replaceSumInto() { EXPECT_TRUE(success); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##replaceSumInto##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ test_replaceSumInto(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - 
defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - 
defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#undef EXECUTE_TEST +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp b/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp index 1c0e279366..e5e1266e1d 100644 --- a/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp +++ b/unit_test/sparse/Test_Sparse_replaceSumIntoLonger.hpp @@ -509,7 +509,7 @@ void test_replaceSumIntoLonger() { EXPECT_TRUE(success); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##replaceSumIntoLonger##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -518,133 +518,9 @@ void test_replaceSumIntoLonger() { // FIXME SYCL: test hangs or gives "CL error -46 invalid kernel name" #ifndef KOKKOS_ENABLE_SYCL -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - 
defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, 
TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if 
(defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif -#endif - -#undef EXECUTE_TEST + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST + +#endif // KOKKOS_ENABLE_SYCL diff --git a/unit_test/sparse/Test_Sparse_rocsparse.hpp b/unit_test/sparse/Test_Sparse_rocsparse.hpp index 27e0b1f9fd..fe1bf8e9b2 100644 --- a/unit_test/sparse/Test_Sparse_rocsparse.hpp +++ b/unit_test/sparse/Test_Sparse_rocsparse.hpp @@ -7,7 +7,7 @@ #include #include #include -#include "KokkosKernels_SparseUtils_rocsparse.hpp" +#include "KokkosSparse_Utils_rocsparse.hpp" void test_rocsparse_version() { // Print version diff --git a/unit_test/sparse/Test_Sparse_spadd.hpp b/unit_test/sparse/Test_Sparse_spadd.hpp index 01c1aad2b9..881f891837 100644 --- a/unit_test/sparse/Test_Sparse_spadd.hpp +++ b/unit_test/sparse/Test_Sparse_spadd.hpp @@ -250,7 +250,7 @@ void test_spadd_known_columns() { ASSERT_EQ(A.nnz(), C.nnz()); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##spadd_sorted_input##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -269,132 +269,6 @@ void test_spadd_known_columns() { test_spadd(50, 50, 75, 100, false); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif +#include -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - 
defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_spgemm.hpp b/unit_test/sparse/Test_Sparse_spgemm.hpp index dd22bb90dc..f52306ef74 100644 --- a/unit_test/sparse/Test_Sparse_spgemm.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm.hpp @@ -45,9 +45,8 @@ #include #include -#include "KokkosKernels_SparseUtils.hpp" -#include "KokkosKernels_Sorting.hpp" -#include +#include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_SortCrs.hpp" #include #include @@ -58,6 +57,7 @@ #include #include +#include // This file contains the matrix for test_issue402 #include "matrixIssue402.hpp" @@ -197,8 +197,8 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { return false; } - KokkosKernels::sort_crs_matrix(output_mat_actual); - KokkosKernels::sort_crs_matrix(output_mat_reference); + KokkosSparse::sort_crs_matrix(output_mat_actual); + KokkosSparse::sort_crs_matrix(output_mat_reference); bool is_identical = true; is_identical = KokkosKernels::Impl::kk_is_identical_view< @@ -229,7 +229,7 @@ bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { typedef typename Kokkos::Details::ArithTraits< typename scalar_view_t::non_const_value_type>::mag_type eps_type; - eps_type eps = std::is_same::value ? 2 * 1e-3 : 1e-7; + eps_type eps = std::is_same::value ? 
3.7e-3 : 1e-7; is_identical = KokkosKernels::Impl::kk_is_relatively_identical_view< scalar_view_t, scalar_view_t, eps_type, typename device::execution_space>( @@ -264,11 +264,13 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, // Generate random compressed sparse row matrix. Randomly generated (non-zero) // values are stored in a 1-D (1 rank) array. - crsMat_t A = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t A = KokkosSparse::Impl::kk_generate_sparse_matrix( m, k, nnz, row_size_variance, bandwidth); - crsMat_t B = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t B = KokkosSparse::Impl::kk_generate_sparse_matrix( k, n, nnz, row_size_variance, bandwidth); + const bool is_empy_case = m < 1 || n < 1 || k < 1 || nnz < 1; + crsMat_t output_mat2; if (oldInterface) run_spgemm_old_interface(A, B, SPGEMM_DEBUG, output_mat2); @@ -280,12 +282,12 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, SPGEMM_KK_SPEED /* alias SPGEMM_KK_DENSE */ }; -#ifdef HAVE_KOKKOSKERNELS_MKL +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL algorithms.push_back(SPGEMM_MKL); #endif for (auto spgemm_algorithm : algorithms) { - const uint64_t max_integer = 2147483647; + const uint64_t max_integer = Kokkos::ArithTraits::max(); std::string algo = "UNKNOWN"; bool is_expected_to_fail = false; @@ -299,15 +301,15 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, #endif break; - case SPGEMM_MKL: - algo = "SPGEMM_MKL"; - // MKL requires scalar to be either float or double - if (!(std::is_same::value || - std::is_same::value)) { + case SPGEMM_MKL: algo = "SPGEMM_MKL"; +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if (!KokkosSparse::Impl::mkl_is_supported_value_type::value) { is_expected_to_fail = true; } - // mkl requires local ordinals to be int. - if (!(std::is_same::value)) { +#endif + // MKL requires local ordinals to be int. + // Note: empty-array special case will NOT fail on this. 
+ if (!std::is_same::value && !is_empy_case) { is_expected_to_fail = true; } // if size_type is larger than int, mkl casts it to int. @@ -315,12 +317,6 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, if (A.values.extent(0) > max_integer) { is_expected_to_fail = true; } - - if (!(Kokkos::SpaceAccessibility< - typename Kokkos::HostSpace::execution_space, - typename device::memory_space>::accessible)) { - is_expected_to_fail = true; - } break; case SPGEMM_KK: algo = "SPGEMM_KK"; break; @@ -352,7 +348,7 @@ void test_spgemm(lno_t m, lno_t k, lno_t n, size_type nnz, lno_t bandwidth, EXPECT_TRUE(is_expected_to_fail) << algo << ": " << e.what(); failed = true; } - EXPECT_TRUE((failed == is_expected_to_fail)); + EXPECT_EQ(is_expected_to_fail, failed); // double spgemm_time = timer1.seconds(); @@ -407,7 +403,7 @@ void test_issue402() { lno_view_t Browmap("B = A^T rowmap", numRows + 1); lno_nnz_view_t Bentries("B = A^T entries", nnz); scalar_view_t Bvalues("B = A^T values", nnz); - KokkosKernels::Impl::transpose_matrix< + KokkosSparse::Impl::transpose_matrix< lno_view_t, lno_nnz_view_t, scalar_view_t, lno_view_t, lno_nnz_view_t, scalar_view_t, lno_view_t, typename device::execution_space>( numRows, numRows, Arowmap, Aentries, Avalues, Browmap, Bentries, Bvalues); @@ -437,7 +433,7 @@ void test_issue402() { << "KKMEM still has issue 402 bug; C=AA' is incorrect!\n"; } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spgemm##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ test_spgemm(10000, 10000, 10000, \ @@ -458,132 +454,6 @@ void test_issue402() { // test_spgemm(50000, 50000 * 30, 100, 10); // test_spgemm(50000, 50000 * 30, 200, 10); -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || 
\ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif +#include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp index 6f416e6f59..4ac707c249 100644 --- a/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp +++ b/unit_test/sparse/Test_Sparse_spgemm_jacobi.hpp @@ -45,9 +45,8 @@ #include #include -#include "KokkosKernels_SparseUtils.hpp" -#include "KokkosKernels_Sorting.hpp" -#include +#include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_SortCrs.hpp" #include #include @@ -58,6 +57,7 @@ #include #include +#include using namespace KokkosSparse; using namespace KokkosSparse::Experimental; @@ -154,7 +154,7 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2) { size_t nentries2 = output_mat2.graph.entries.extent(0); size_t nvals2 = output_mat2.values.extent(0); - KokkosKernels::sort_crs_matrix(output_mat1); + KokkosSparse::sort_crs_matrix(output_mat1); if (nrows1 != nrows2) { std::cout << "nrows1:" << nrows1 << " nrows2:" << nrows2 << std::endl; @@ -170,7 +170,7 @@ bool is_same_mat(crsMat_t output_mat1, crsMat_t output_mat2) { return false; } - KokkosKernels::sort_crs_matrix(output_mat2); + KokkosSparse::sort_crs_matrix(output_mat2); bool 
is_identical = true; is_identical = KokkosKernels::Impl::kk_is_identical_view< @@ -225,7 +225,7 @@ void test_spgemm_jacobi(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t numCols = numRows; crsMat_t input_mat = - KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); crsMat_t output_mat2; @@ -258,7 +258,7 @@ void test_spgemm_jacobi(lno_t numRows, size_type nnz, lno_t bandwidth, EXPECT_TRUE(is_identical); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##spgemm_jacobi##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ @@ -266,132 +266,6 @@ void test_spgemm_jacobi(lno_t numRows, size_type nnz, lno_t bandwidth, 10); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
-EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - 
defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#undef EXECUTE_TEST +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_spiluk.hpp b/unit_test/sparse/Test_Sparse_spiluk.hpp index 
31bd4b47ec..863bdf0808 100644 --- a/unit_test/sparse/Test_Sparse_spiluk.hpp +++ b/unit_test/sparse/Test_Sparse_spiluk.hpp @@ -45,11 +45,10 @@ #include #include -#include #include #include -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include #include "KokkosBlas1_nrm2.hpp" @@ -299,142 +298,15 @@ void test_spiluk() { Test::run_test_spiluk(); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spiluk##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ test_spiluk(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) 
&& \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if 0 - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if 
(defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#endif - -#undef EXECUTE_TEST +#define NO_TEST_COMPLEX + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST +#undef NO_TEST_COMPLEX diff --git a/unit_test/sparse/Test_Sparse_spmv.hpp b/unit_test/sparse/Test_Sparse_spmv.hpp index 55c608a11e..8a15153dce 100644 --- a/unit_test/sparse/Test_Sparse_spmv.hpp +++ b/unit_test/sparse/Test_Sparse_spmv.hpp @@ -6,6 +6,7 @@ #include #include 
#include +#include #include #include "KokkosKernels_Controls.hpp" @@ -22,6 +23,32 @@ typedef Kokkos::Experimental::half_t kokkos_half; namespace Test { +// Functor checking that the results of SPMV +// are consistent with a reference sequential +// implementation of the same operation. +// +// Inputs: +// - _ex_y the expected result calculated +// from the reference implementation +// - _y the result from optimized SPMV being +// tested for correctness +// - _eps the tolerance required to accept the +// results as correct +// - _max_val the largest possible value that can +// be stored as an intermediate result +// during the computation +// +// The criteria to assess correctness is +// abs(_ex_y - _y) / _max_val < tol +// +// Note: _max_val in the case of SPMV can be computed +// as follows. Find the max number of entries per +// row in the matrix (max_row_length), also find the +// largest value that can be stored in the matrix, x +// and y vectors (max_mat, max_x and max_y). +// +// _max_val = beta*max_y +// + alpha*max_row_length*max_mat*max_x template struct fSPMV { using value_type = int; @@ -32,21 +59,24 @@ struct fSPMV { VectorType0 expected_y; VectorType1 y; mag_type eps; + mag_type max_val; - fSPMV(const VectorType0 &_ex_y, const VectorType1 &_y, const mag_type _eps) - : expected_y(_ex_y), y(_y), eps(_eps) {} + fSPMV(const VectorType0 &_ex_y, const VectorType1 &_y, const mag_type _eps, + const mag_type _max_val = ATM::one()) + : expected_y(_ex_y), + y(_y), + eps(AT::abs(_eps)), + max_val(AT::abs(_max_val)) {} KOKKOS_INLINE_FUNCTION void operator()(const int i, value_type &err) const { - const mag_type error = - AT::abs(expected_y(i) - y(i)) / (AT::abs(expected_y(i)) > ATM::zero() - ? 
AT::abs(expected_y(i)) - : ATM::one()); + const mag_type error = AT::abs(expected_y(i) - y(i)); - if (error > eps) { + if (error > eps * max_val) { err++; - // printf("expected_y(%d)=%f, y(%d)=%f err=%f, eps=%f\n", i, - // AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "expected_y(%d)=%f, y(%d)=%f err=%f, max_error=%f\n", i, + AT::abs(expected_y(i)), i, AT::abs(y(i)), error, eps * max_val); } } }; @@ -113,9 +143,12 @@ void sequential_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, } template -void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, - typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, char mode) { +void check_spmv( + crsMat_t input_mat, x_vector_type x, y_vector_type y, + typename y_vector_type::non_const_value_type alpha, + typename y_vector_type::non_const_value_type beta, char mode, + typename Kokkos::ArithTraits::mag_type + max_val) { // typedef typename crsMat_t::StaticCrsGraphType graph_t; using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; @@ -123,11 +156,8 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, using y_value_trait = Kokkos::ArithTraits; using y_value_mag_type = typename y_value_trait::mag_type; - // y is the quantity being tested here, - // so let us use y_value_type to determine - // the appropriate tolerance precision. const y_value_mag_type eps = - std::is_same::value ? 2 * 1e-3 : 1e-7; + 10 * Kokkos::ArithTraits::eps(); bool transposed = (mode == 'T') || (mode == 'H'); y_vector_type expected_y( "expected", transposed ? 
input_mat.numCols() : input_mat.numRows()); @@ -150,7 +180,8 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, int num_errors = 0; Kokkos::parallel_reduce( "KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)), - fSPMV(expected_y, y, eps), num_errors); + fSPMV(expected_y, y, eps, max_val), + num_errors); if (num_errors > 0) printf("KokkosSparse::Test::spmv: %i errors of %i with params: %lf %lf\n", num_errors, y.extent_int(0), y_value_trait::abs(alpha), @@ -159,11 +190,13 @@ void check_spmv(crsMat_t input_mat, x_vector_type x, y_vector_type y, } template -void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, - y_vector_type expected_y, - typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, int numMV, - char mode) { +void check_spmv_mv( + crsMat_t input_mat, x_vector_type x, y_vector_type y, + y_vector_type expected_y, + typename y_vector_type::non_const_value_type alpha, + typename y_vector_type::non_const_value_type beta, int numMV, char mode, + typename Kokkos::ArithTraits::mag_type + max_val) { using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -174,7 +207,7 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, // so let us use y_value_type to determine // the appropriate tolerance precision. const y_value_mag_type eps = - std::is_same::value ? 
2 * 1e-3 : 1e-7; + 10 * Kokkos::ArithTraits::eps(); Kokkos::deep_copy(expected_y, y); @@ -205,7 +238,8 @@ void check_spmv_mv(crsMat_t input_mat, x_vector_type x, y_vector_type y, int num_errors = 0; Kokkos::parallel_reduce( "KokkosSparse::Test::spmv_mv", my_exec_space(0, y_i.extent(0)), - fSPMV(y_i, y_spmv, eps), num_errors); + fSPMV(y_i, y_spmv, eps, max_val), + num_errors); if (num_errors > 0) std::cout << "KokkosSparse::Test::spmv_mv: " << num_errors << " errors of " << y_i.extent_int(0) << " for mv " << i @@ -223,7 +257,9 @@ void check_spmv_struct( structure, x_vector_type x, y_vector_type y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta) { + typename y_vector_type::non_const_value_type beta, + typename Kokkos::ArithTraits::mag_type + max_val) { using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -233,9 +269,8 @@ void check_spmv_struct( // y is the quantity being tested here, // so let us use y_value_type to determine // the appropriate tolerance precision. - const double eps = - std::is_same::value ? 
2 * 1e-3 : 1e-7; - const size_t nr = input_mat.numRows(); + const double eps = Kokkos::ArithTraits::eps(); + const size_t nr = input_mat.numRows(); y_vector_type expected_y("expected", nr); Kokkos::deep_copy(expected_y, y); Kokkos::fence(); @@ -247,13 +282,15 @@ void check_spmv_struct( int num_errors = 0; Kokkos::parallel_reduce( "KokkosKernels::UnitTests::spmv_struct", my_exec_space(0, y.extent(0)), - fSPMV(expected_y, y, eps), num_errors); - if (num_errors > 0) + fSPMV(expected_y, y, eps, max_val), + num_errors); + if (num_errors > 0) { printf( "KokkosKernels::UnitTests::spmv_struct: %i errors of %i with params: " "%d %lf %lf\n", num_errors, y.extent_int(0), stencil_type, y_value_trait::abs(alpha), y_value_trait::abs(beta)); + } EXPECT_TRUE(num_errors == 0); } // check_spmv_struct @@ -265,7 +302,9 @@ void check_spmv_mv_struct( structure, x_vector_type x, y_vector_type y, y_vector_type expected_y, typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta, int numMV) { + typename y_vector_type::non_const_value_type beta, int numMV, + typename Kokkos::ArithTraits::mag_type + max_val) { using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; using y_value_type = typename y_vector_type::non_const_value_type; @@ -275,8 +314,7 @@ void check_spmv_mv_struct( // y is the quantity being tested here, // so let us use y_value_type to determine // the appropriate tolerance precision. - const double eps = - std::is_same::value ? 
2 * 1e-3 : 1e-7; + const double eps = Kokkos::ArithTraits::eps(); Kokkos::deep_copy(expected_y, y); Kokkos::fence(); @@ -295,7 +333,8 @@ void check_spmv_mv_struct( Kokkos::parallel_reduce( "KokkosKernels::UnitTests::spmv_mv_struct", my_exec_space(0, y.extent(0)), - fSPMV(y_i, y_spmv, eps), num_errors); + fSPMV(y_i, y_spmv, eps, max_val), + num_errors); if (num_errors > 0) printf( "KokkosKernels::UnitTests::spmv_mv_struct: %i errors of %i with " @@ -307,10 +346,13 @@ void check_spmv_mv_struct( } // check_spmv_mv_struct template -void check_spmv_controls(KokkosKernels::Experimental::Controls controls, - crsMat_t input_mat, x_vector_type x, y_vector_type y, - typename y_vector_type::non_const_value_type alpha, - typename y_vector_type::non_const_value_type beta) { +void check_spmv_controls( + KokkosKernels::Experimental::Controls controls, crsMat_t input_mat, + x_vector_type x, y_vector_type y, + typename y_vector_type::non_const_value_type alpha, + typename y_vector_type::non_const_value_type beta, + typename Kokkos::ArithTraits::mag_type + max_val) { // typedef typename crsMat_t::StaticCrsGraphType graph_t; using ExecSpace = typename crsMat_t::execution_space; using my_exec_space = Kokkos::RangePolicy; @@ -339,7 +381,8 @@ void check_spmv_controls(KokkosKernels::Experimental::Controls controls, int num_errors = 0; Kokkos::parallel_reduce( "KokkosSparse::Test::spmv", my_exec_space(0, y.extent(0)), - fSPMV(expected_y, y, eps), num_errors); + fSPMV(expected_y, y, eps, max_val), + num_errors); if (num_errors > 0) printf("KokkosSparse::Test::spmv: %i errors of %i with params: %lf %lf\n", num_errors, y.extent_int(0), y_value_trait::abs(alpha), @@ -367,20 +410,27 @@ Kokkos::complex randomUpperBound>(int mag) { template void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, bool heavy) { - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t 
x_vector_type; - typedef scalar_view_t y_vector_type; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using scalar_view_t = typename crsMat_t::values_type::non_const_type; + using x_vector_type = scalar_view_t; + using y_vector_type = scalar_view_t; + using mag_t = typename Kokkos::ArithTraits::mag_type; + + constexpr mag_t max_x = static_cast(1); + constexpr mag_t max_y = static_cast(1); + constexpr mag_t max_val = static_cast(1); lno_t numCols = numRows; - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, row_size_variance, bandwidth); lno_t nr = input_mat.numRows(); lno_t nc = input_mat.numCols(); + const lno_t max_nnz_per_row = + numRows ? (nnz / numRows + row_size_variance) : 0; + x_vector_type input_x("x", nc); y_vector_type output_y("y", nr); x_vector_type input_xt("x", nr); @@ -389,13 +439,16 @@ void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::Random_XorShift64_Pool rand_pool( 13718); - typedef typename x_vector_type::value_type ScalarX; - typedef typename y_vector_type::value_type ScalarY; + Kokkos::fill_random(input_x, rand_pool, randomUpperBound(max_x)); + Kokkos::fill_random(output_y, rand_pool, randomUpperBound(max_y)); + Kokkos::fill_random(input_xt, rand_pool, randomUpperBound(max_x)); + Kokkos::fill_random(output_yt, rand_pool, randomUpperBound(max_y)); - Kokkos::fill_random(input_x, rand_pool, randomUpperBound(1)); - Kokkos::fill_random(output_y, rand_pool, randomUpperBound(1)); - Kokkos::fill_random(input_xt, rand_pool, randomUpperBound(1)); - Kokkos::fill_random(output_yt, rand_pool, randomUpperBound(1)); + // We also need to bound the values + // in the matrix to bound the cancellations + // coming from arithmetic operations. 
+ Kokkos::fill_random(input_mat.values, rand_pool, + randomUpperBound(max_val)); std::vector nonTransModes = {'N'}; std::vector transModes = {'T'}; @@ -409,14 +462,21 @@ void test_spmv(lno_t numRows, size_type nnz, lno_t bandwidth, for (auto mode : nonTransModes) { for (double alpha : testAlphaBeta) { for (double beta : testAlphaBeta) { - Test::check_spmv(input_mat, input_x, output_y, alpha, beta, mode); + mag_t max_error = + beta * max_y + alpha * max_nnz_per_row * max_val * max_x; + Test::check_spmv(input_mat, input_x, output_y, alpha, beta, mode, + max_error); } } } for (auto mode : transModes) { for (double alpha : testAlphaBeta) { for (double beta : testAlphaBeta) { - Test::check_spmv(input_mat, input_xt, output_yt, alpha, beta, mode); + // hoping the transpose won't have a long column... + mag_t max_error = + beta * max_y + alpha * max_nnz_per_row * max_val * max_x; + Test::check_spmv(input_mat, input_xt, output_yt, alpha, beta, mode, + max_error); } } } @@ -426,14 +486,18 @@ template void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, bool heavy, int numMV) { - lno_t numCols = numRows; + using mag_t = typename Kokkos::ArithTraits::mag_type; + + constexpr mag_t max_x = static_cast(1); + constexpr mag_t max_y = static_cast(1); + constexpr mag_t max_val = static_cast(1); - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; + lno_t numCols = numRows; - typedef Kokkos::View ViewTypeX; - typedef Kokkos::View ViewTypeY; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using ViewTypeX = Kokkos::View; + using ViewTypeY = Kokkos::View; ViewTypeX b_x("A", numRows, numMV); ViewTypeY b_y("B", numCols, numMV); @@ -445,14 +509,23 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::Random_XorShift64_Pool rand_pool( 13718); - Kokkos::fill_random(b_x, rand_pool, randomUpperBound(1)); - Kokkos::fill_random(b_y, rand_pool, randomUpperBound(1)); - Kokkos::fill_random(b_xt, rand_pool, randomUpperBound(1)); - 
Kokkos::fill_random(b_yt, rand_pool, randomUpperBound(1)); + Kokkos::fill_random(b_x, rand_pool, randomUpperBound(max_x)); + Kokkos::fill_random(b_y, rand_pool, randomUpperBound(max_y)); + Kokkos::fill_random(b_xt, rand_pool, randomUpperBound(max_x)); + Kokkos::fill_random(b_yt, rand_pool, randomUpperBound(max_y)); - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, row_size_variance, bandwidth); + const lno_t max_nnz_per_row = + numRows ? (nnz / numRows + row_size_variance) : 0; + + // We also need to bound the values + // in the matrix to bound the cancellations + // coming from arithmetic operations. + Kokkos::fill_random(input_mat.values, rand_pool, + randomUpperBound(max_val)); + Kokkos::deep_copy(b_y_copy, b_y); Kokkos::deep_copy(b_yt_copy, b_yt); @@ -468,16 +541,21 @@ void test_spmv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, for (auto mode : nonTransModes) { for (double alpha : testAlphaBeta) { for (double beta : testAlphaBeta) { + mag_t max_error = + beta * max_y + alpha * max_nnz_per_row * max_val * max_x; Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, numMV, - mode); + mode, max_error); } } } for (auto mode : transModes) { for (double alpha : testAlphaBeta) { for (double beta : testAlphaBeta) { + // hoping the transpose won't have a long column... 
+ mag_t max_error = + beta * max_y + alpha * max_nnz_per_row * max_val * max_x; Test::check_spmv_mv(input_mat, b_xt, b_yt, b_yt_copy, alpha, beta, - numMV, mode); + numMV, mode, max_error); } } } @@ -487,18 +565,24 @@ template void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, int numMV) { - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using ViewTypeX = Kokkos::View; + using ViewTypeY = Kokkos::View; + using mag_t = typename Kokkos::ArithTraits::mag_type; - typedef Kokkos::View ViewTypeX; - typedef Kokkos::View ViewTypeY; + constexpr mag_t max_x = static_cast(10); + constexpr mag_t max_y = static_cast(10); + constexpr mag_t max_val = static_cast(10); - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numRows, nnz, row_size_variance, bandwidth); Kokkos::Random_XorShift64_Pool rand_pool( 13718); + const lno_t max_nnz_per_row = + numRows ? 
(nnz / numRows + row_size_variance) : 0; + for (int nv = 1; nv <= numMV; nv++) { ViewTypeX b_x("A", numRows, nv); ViewTypeY b_y("B", numRows, nv); @@ -506,22 +590,30 @@ void test_spmv_mv_heavy(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::fill_random(b_x, rand_pool, scalar_t(10)); Kokkos::fill_random(b_y, rand_pool, scalar_t(10)); + Kokkos::fill_random(input_mat.values, rand_pool, scalar_t(10)); Kokkos::deep_copy(b_y_copy, b_y); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N'); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N'); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N'); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T'); - Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T'); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'N', + max_nnz_per_row * max_val * max_x); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'N', + max_y); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 1.0, nv, 'N', + max_y + max_nnz_per_row * max_val * max_x); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 1.0, 0.0, nv, 'T', + max_nnz_per_row * max_val * max_x); + Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, 0.0, 1.0, nv, 'T', + max_y); // Testing all modes together, since matrix is square std::vector modes = {'N', 'C', 'T', 'H'}; std::vector testAlphaBeta = {0.0, 1.0, -1.0, 2.5}; for (auto mode : modes) { for (double alpha : testAlphaBeta) { for (double beta : testAlphaBeta) { + mag_t max_error = + beta * max_y + alpha * max_nnz_per_row * max_val * max_x; Test::check_spmv_mv(input_mat, b_x, b_y, b_y_copy, alpha, beta, nv, - mode); + mode, max_error); } } } @@ -535,6 +627,11 @@ void test_spmv_struct_1D(lno_t nx, lno_t leftBC, lno_t rightBC) { using scalar_view_t = typename crsMat_t::values_type::non_const_type; using x_vector_type = scalar_view_t; using y_vector_type = scalar_view_t; + using mag_t = typename 
Kokkos::ArithTraits::mag_type; + + constexpr mag_t max_x = static_cast(1); + constexpr mag_t max_y = static_cast(1); + constexpr mag_t max_val = static_cast(2); Kokkos::View structure("Spmv Structure", 1); structure(0) = nx; @@ -560,26 +657,31 @@ void test_spmv_struct_1D(lno_t nx, lno_t leftBC, lno_t rightBC) { Kokkos::Random_XorShift64_Pool rand_pool( 13718); - typedef typename x_vector_type::value_type ScalarX; - typedef typename y_vector_type::value_type ScalarY; + Kokkos::fill_random(input_x, rand_pool, max_x); + Kokkos::fill_random(output_y, rand_pool, max_y); - Kokkos::fill_random(input_x, rand_pool, ScalarX(1)); - Kokkos::fill_random(output_y, rand_pool, ScalarY(1)); + const mag_t max_error = max_y + 3 * max_val * max_x; - Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 1.0, 0.0); - Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 0.0, 1.0); - Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 1.0, 1.0); + Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 1.0, 0.0, + max_error); + Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 0.0, 1.0, + max_error); + Test::check_spmv_struct(input_mat, 1, structure, input_x, output_y, 1.0, 1.0, + max_error); } template void test_spmv_struct_2D(lno_t nx, lno_t ny, lno_t horizontalBC, lno_t verticalBC) { - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t x_vector_type; - typedef scalar_view_t y_vector_type; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using scalar_view_t = typename crsMat_t::values_type::non_const_type; + using x_vector_type = scalar_view_t; + using y_vector_type = scalar_view_t; + using mag_t = typename Kokkos::ArithTraits::mag_type; + + constexpr mag_t max_x = static_cast(1); + constexpr mag_t max_y = static_cast(1); Kokkos::View structure("Spmv Structure", 2); structure(0) = nx; @@ -615,36 +717,44 @@ 
void test_spmv_struct_2D(lno_t nx, lno_t ny, lno_t horizontalBC, Kokkos::Random_XorShift64_Pool rand_pool( 13718); - typedef typename x_vector_type::value_type ScalarX; - typedef typename y_vector_type::value_type ScalarY; - - Kokkos::fill_random(input_x, rand_pool, ScalarX(1)); - Kokkos::fill_random(output_y, rand_pool, ScalarY(1)); - - Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, - 0.0); - Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 0.0, - 1.0); - Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, - 1.0); - - Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, - 0.0); - Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 0.0, - 1.0); - Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, - 1.0); + Kokkos::fill_random(input_x, rand_pool, max_x); + Kokkos::fill_random(output_y, rand_pool, max_y); + + { + constexpr mag_t max_val = static_cast(4); + constexpr mag_t max_error = max_y + 5 * max_val * max_x; + Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, + 0.0, max_error); + Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 0.0, + 1.0, max_error); + Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, + 1.0, max_error); + } + + { + constexpr mag_t max_val = static_cast(8); + constexpr mag_t max_error = max_y + 9 * max_val * max_x; + Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, + 0.0, max_error); + Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 0.0, + 1.0, max_error); + Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, + 1.0, max_error); + } } template void test_spmv_struct_3D(lno_t nx, lno_t ny, lno_t nz, lno_t horizontal1BC, lno_t horizontal2BC, lno_t verticalBC) { - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef typename 
crsMat_t::values_type::non_const_type scalar_view_t; - typedef scalar_view_t x_vector_type; - typedef scalar_view_t y_vector_type; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using scalar_view_t = typename crsMat_t::values_type::non_const_type; + using x_vector_type = scalar_view_t; + using y_vector_type = scalar_view_t; + using mag_t = typename Kokkos::ArithTraits::mag_type; + + constexpr mag_t max_x = static_cast(1); + constexpr mag_t max_y = static_cast(1); Kokkos::View structure("Spmv Structure", 3); structure(0) = nx; @@ -688,35 +798,43 @@ void test_spmv_struct_3D(lno_t nx, lno_t ny, lno_t nz, lno_t horizontal1BC, Kokkos::Random_XorShift64_Pool rand_pool( 13718); - typedef typename x_vector_type::value_type ScalarX; - typedef typename y_vector_type::value_type ScalarY; - - Kokkos::fill_random(input_x, rand_pool, ScalarX(1)); - Kokkos::fill_random(output_y, rand_pool, ScalarY(1)); - - Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, - 0.0); - Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 0.0, - 1.0); - Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, - 1.0); - - Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, - 0.0); - Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 0.0, - 1.0); - Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, - 1.0); + Kokkos::fill_random(input_x, rand_pool, max_x); + Kokkos::fill_random(output_y, rand_pool, max_y); + + { + constexpr mag_t max_val = static_cast(6); + constexpr mag_t max_error = max_y + 7 * max_val * max_x; + Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, + 0.0, max_error); + Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 0.0, + 1.0, max_error); + Test::check_spmv_struct(input_mat_FD, 1, structure, input_x, output_y, 1.0, + 1.0, max_error); + } + + { + constexpr mag_t max_val = static_cast(26); + constexpr 
mag_t max_error = max_y + 27 * max_val * max_x; + Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, + 0.0, max_error); + Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 0.0, + 1.0, max_error); + Test::check_spmv_struct(input_mat_FE, 2, structure, input_x, output_y, 1.0, + 1.0, max_error); + } } template void test_spmv_mv_struct_1D(lno_t nx, int numMV) { - typedef - typename KokkosSparse::CrsMatrix - crsMat_t; - typedef Kokkos::View x_multivector_type; - typedef Kokkos::View y_multivector_type; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using x_multivector_type = Kokkos::View; + using y_multivector_type = Kokkos::View; + using mag_t = typename Kokkos::ArithTraits::mag_type; + + constexpr mag_t max_x = static_cast(1); + constexpr mag_t max_y = static_cast(1); Kokkos::View structure("Spmv Structure", 1); structure(0) = nx; @@ -739,20 +857,19 @@ void test_spmv_mv_struct_1D(lno_t nx, int numMV) { Kokkos::Random_XorShift64_Pool rand_pool( 13718); - typedef typename x_multivector_type::value_type ScalarX; - typedef typename y_multivector_type::value_type ScalarY; + Kokkos::fill_random(input_x, rand_pool, max_x); + Kokkos::fill_random(output_y, rand_pool, max_y); - Kokkos::fill_random(input_x, rand_pool, ScalarX(10)); - Kokkos::fill_random(output_y, rand_pool, ScalarY(10)); + constexpr mag_t max_error = 5; Kokkos::deep_copy(output_y_copy, output_y); Test::check_spmv_mv_struct(input_mat, 1, structure, input_x, output_y, - output_y_copy, 1.0, 0.0, numMV); + output_y_copy, 1.0, 0.0, numMV, max_error); Test::check_spmv_mv_struct(input_mat, 1, structure, input_x, output_y, - output_y_copy, 0.0, 1.0, numMV); + output_y_copy, 0.0, 1.0, numMV, max_error); Test::check_spmv_mv_struct(input_mat, 1, structure, input_x, output_y, - output_y_copy, 1.0, 1.0, numMV); + output_y_copy, 1.0, 1.0, numMV, max_error); } // check that the controls are flowing down correctly in the spmv kernel @@ -765,10 +882,15 @@ void 
test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, using x_vector_type = scalar_view_t; using y_vector_type = scalar_view_t; using Controls = KokkosKernels::Experimental::Controls; + using mag_t = typename Kokkos::ArithTraits::mag_type; + + constexpr mag_t max_x = static_cast(10); + constexpr mag_t max_y = static_cast(10); + constexpr mag_t max_val = static_cast(10); lno_t numCols = numRows; - crsMat_t input_mat = KokkosKernels::Impl::kk_generate_sparse_matrix( + crsMat_t input_mat = KokkosSparse::Impl::kk_generate_sparse_matrix( numRows, numCols, nnz, row_size_variance, bandwidth); lno_t nr = input_mat.numRows(); lno_t nc = input_mat.numCols(); @@ -779,17 +901,20 @@ void test_spmv_controls(lno_t numRows, size_type nnz, lno_t bandwidth, Kokkos::Random_XorShift64_Pool rand_pool( 13718); - using ScalarX = typename x_vector_type::value_type; - using ScalarY = typename y_vector_type::value_type; + Kokkos::fill_random(input_x, rand_pool, max_x); + Kokkos::fill_random(output_y, rand_pool, max_y); + Kokkos::fill_random(input_mat.values, rand_pool, max_val); - Kokkos::fill_random(input_x, rand_pool, ScalarX(10)); - Kokkos::fill_random(output_y, rand_pool, ScalarY(10)); + const mag_t max_error = max_y + bandwidth * max_val * max_x; Controls controls; - Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 0.0); - Test::check_spmv_controls(controls, input_mat, input_x, output_y, 0.0, 1.0); - Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 1.0); + Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 0.0, + max_error); + Test::check_spmv_controls(controls, input_mat, input_x, output_y, 0.0, 1.0, + max_error); + Test::check_spmv_controls(controls, input_mat, input_x, output_y, 1.0, 1.0, + max_error); } // test_spmv_controls // call it if ordinal int and, scalar float and double are instantiated. 
@@ -937,23 +1062,12 @@ void test_github_issue_101() { } } -#define EXECUTE_TEST_ISSUE_101(DEVICE) \ - TEST_F(TestCategory, sparse##_##spmv_issue_101##_##OFFSET##_##DEVICE) { \ - test_github_issue_101(); \ - } - template CrsMat make_block_matrix(typename CrsMat::ordinal_type &numRows, typename CrsMat::ordinal_type &numCols, typename CrsMat::ordinal_type &blockSize) { -#if 0 - typedef typename CrsMat::StaticCrsGraphType::row_map_type::non_const_type ptr_type ; - typedef typename CrsMat::StaticCrsGraphType::entries_type::non_const_type ind_type ; - typedef typename CrsMat::values_type::non_const_type val_type ; - typedef typename CrsMat::size_type size_type; -#endif - typedef typename CrsMat::ordinal_type lno_t; - typedef typename CrsMat::value_type scalar_t; + using lno_t = typename CrsMat::ordinal_type; + using scalar_t = typename CrsMat::value_type; using Kokkos::HostSpace; using Kokkos::MemoryUnmanaged; @@ -1212,22 +1326,21 @@ template &pattern, const int m, const int n, - lno_t blockSize, lno_t k, y_scalar_t alpha, y_scalar_t beta) { + lno_t blockSize, lno_t k, y_scalar_t alpha, y_scalar_t beta, + const int max_blocks_per_row) { // get the widest passed scalar type // typedef typename std::conditional= sizeof(x_scalar_t), // a_scalar_t, x_scalar_t>::type wider_t; // typedef typename std::conditional= sizeof(y_scalar_t), // wider_t, y_scalar_t>::type widest_t; - typedef typename KokkosSparse::CrsMatrix - crs_mat_t; - typedef + using crs_mat_t = typename KokkosSparse::CrsMatrix; + using bsr_mat_t = typename KokkosSparse::Experimental::BsrMatrix - bsr_mat_t; - typedef Kokkos::View x_view_t; - typedef Kokkos::View y_view_t; + void, size_type>; + using x_view_t = Kokkos::View; + using y_view_t = Kokkos::View; using DeviceRangePolicy = Kokkos::RangePolicy; @@ -1248,23 +1361,19 @@ void test_spmv_bsrmatrix_controls_pattern( y_view_t test_y("test_y", m * blockSize, k); x_view_t test_x("test_x", n * blockSize, k); + constexpr x_scalar_t max_x = 10; + constexpr y_scalar_t 
max_y = 10; + constexpr a_scalar_t max_a = 10; + const double max_val = + beta * max_y + alpha * max_blocks_per_row * max_a * max_x; + // fill expected with random values Kokkos::Random_XorShift64_Pool rand_pool( 13718); Kokkos::fill_random(exp_x, rand_pool, - randomUpperBound(10)); + randomUpperBound(max_x)); Kokkos::fill_random(exp_y, rand_pool, - randomUpperBound(10)); - -#if 0 - // fill inputs with 1, for help debugging - Kokkos::parallel_for("fill", - Kokkos::MDRangePolicy>({0,0}, {hi_x.extent(0), hi_x.extent(1)}), - KOKKOS_LAMBDA (unsigned i, unsigned j) { - hi_x(i,j) = 1 + (i == 0 && j == 0); - } - ); -#endif + randomUpperBound(max_y)); // copy expected operands to test operands Kokkos::deep_copy(test_x, exp_x); @@ -1292,11 +1401,11 @@ void test_spmv_bsrmatrix_controls_pattern( // uses CUDA's half type, not Kokkos, so we still need a reduced precision // test. double eps = - KOKKOSKERNELS_IMPL_FP16_EPSILON * KOKKOSKERNELS_IMPL_FP16_RADIX; + 2 * KOKKOSKERNELS_IMPL_FP16_EPSILON * KOKKOSKERNELS_IMPL_FP16_RADIX; Kokkos::parallel_reduce("KokkosSparse::Test::spmv_tc", DeviceRangePolicy(0, exp_y_i.extent(0)), Test::fSPMV( - exp_y_i, test_y_i, eps), + exp_y_i, test_y_i, eps, max_val), num_errors); // explicit cast to double since no overload for half::operator<< if (num_errors > 0) @@ -1318,13 +1427,14 @@ template void test_spmv_bsrmatrix_pattern(const std::vector &pattern, const int m, const int n, lno_t blockSize, - lno_t k, y_scalar_t alpha, y_scalar_t beta) { + lno_t k, y_scalar_t alpha, y_scalar_t beta, + const int max_blocks_per_row) { { KokkosKernels::Experimental::Controls controls; controls.setParameter("algorithm", "experimental_bsr_tc"); test_spmv_bsrmatrix_controls_pattern( - controls, pattern, m, n, blockSize, k, alpha, beta); + controls, pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } #if defined(KOKKOS_ARCH_AMPERE) @@ -1334,7 +1444,7 @@ void test_spmv_bsrmatrix_pattern(const std::vector &pattern, controls.setParameter("tc_precision", 
"double"); test_spmv_bsrmatrix_controls_pattern( - controls, pattern, m, n, blockSize, k, alpha, beta); + controls, pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } #endif } @@ -1352,69 +1462,76 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, { int m = 1; int n = 1; + int max_blocks_per_row = 1; std::vector pattern = {Coordinate(0, 0)}; test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } // 1x1 empty { int m = 1; int n = 1; + int max_blocks_per_row = 0; std::vector pattern = {}; test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } // 2x2 top-left { int m = 2; int n = 2; + int max_blocks_per_row = 1; std::vector pattern = {Coordinate(0, 0)}; test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } // 2x2 bottom right { int m = 2; int n = 2; + int max_blocks_per_row = 1; std::vector pattern = {Coordinate(1, 1)}; test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } // 2x3 bottom right { int m = 2; int n = 3; + int max_blocks_per_row = 1; std::vector pattern = {Coordinate(1, 2)}; test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } // 2x10 long bottom row { - int m = 2; - int n = 10; + int m = 2; + int n = 10; + int max_blocks_per_row = 10; std::vector pattern; for (int j = 0; j < n; ++j) { pattern.push_back(Coordinate(1, j)); } test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } // 10x10 column 1 + diagonal { - int m = 10; - int n = 10; + int m = 10; + int n = 10; + int max_blocks_per_row = 2; std::vector 
pattern; for (int i = 0; i < n; ++i) { pattern.push_back(Coordinate(i, 1)); @@ -1424,11 +1541,16 @@ void test_spmv_bsrmatrix(lno_t blockSize, lno_t k, y_scalar_t alpha, } test_spmv_bsrmatrix_pattern( - pattern, m, n, blockSize, k, alpha, beta); + pattern, m, n, blockSize, k, alpha, beta, max_blocks_per_row); } } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define EXECUTE_TEST_ISSUE_101(DEVICE) \ + TEST_F(TestCategory, sparse##_##spmv_issue_101##_##OFFSET##_##DEVICE) { \ + test_github_issue_101(); \ + } + +#define EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F(TestCategory, \ sparse##_##spmv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ test_spmv(1000, 1000 * 3, 200, 10, true); \ @@ -1612,469 +1734,42 @@ EXECUTE_TEST_TC(double, double, double, int, size_t, LayoutRight, TestExecSpace) EXECUTE_TEST_ISSUE_101(TestExecSpace) #endif -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -EXECUTE_TEST_STRUCT(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -EXECUTE_TEST_STRUCT(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(double, int, size_t, TestExecSpace) -#endif +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + 
EXECUTE_TEST_FN(SCALAR, ORDINAL, OFFSET, TestExecSpace) \ + EXECUTE_TEST_STRUCT(SCALAR, ORDINAL, OFFSET, TestExecSpace) -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(double, int64_t, size_t, TestExecSpace) -#endif +#include -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -EXECUTE_TEST_STRUCT(float, int, int, TestExecSpace) -#endif +#undef KOKKOSKERNELS_EXECUTE_TEST -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -EXECUTE_TEST_STRUCT(float, int64_t, int, TestExecSpace) -#endif -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(float, int, size_t, TestExecSpace) -#endif +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) \ + EXECUTE_TEST_MV_STRUCT(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) 
|| \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(float, int64_t, size_t, TestExecSpace) -#endif +#include -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif +#undef KOKKOSKERNELS_EXECUTE_TEST -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ 
- (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -EXECUTE_TEST_STRUCT(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(double, int, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - 
defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(double, int64_t, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(double, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(double, int64_t, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(float, int, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(float, int64_t, int, 
LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(float, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(float, int64_t, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_double, int, int, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_double, int64_t, int, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - 
defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_double, int, size_t, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, size_t, LayoutLeft, - TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_double, int64_t, size_t, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_float, int, int, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, int, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_float, int64_t, int, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, size_t, LayoutLeft, TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_float, int, size_t, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, size_t, LayoutLeft, - TestExecSpace) -EXECUTE_TEST_MV_STRUCT(kokkos_complex_float, int64_t, size_t, LayoutLeft, - TestExecSpace) -#endif #endif // defined(KOKKOSKERNELS_INST_LAYOUTLEFT) -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - 
defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, size_t, LayoutRight, TestExecSpace) -#endif -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, size_t, LayoutRight, TestExecSpace) -#endif +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - 
defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, int, LayoutRight, TestExecSpace) -#endif +#include -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, size_t, LayoutRight, TestExecSpace) -#endif +#undef KOKKOSKERNELS_EXECUTE_TEST -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, size_t, LayoutRight, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - 
defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, size_t, LayoutRight, - TestExecSpace) -#endif +#endif // defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) -#undef EXECUTE_TEST +#undef EXECUTE_TEST_FN #undef EXECUTE_TEST_STRUCT #undef EXECUTE_TEST_MV #undef EXECUTE_TEST_MV_STRUCT diff --git a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp index f570a2d5df..a96af6973e 100644 --- a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp @@ -42,6 +42,7 @@ //@HEADER */ +#include #include #include #include @@ -128,36 +129,44 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta, size_type nnz = static_cast(blockSize) * static_cast(blockSize) * mat_b1.nnz(); - // Fill block with random values - std::vector mat_val(nnz); - for (size_type ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]); - // // Create graph for CrsMatrix // - std::vector mat_rowmap(nRow + 1, 0); - std::vector mat_colidx(nnz, 0); + Kokkos::View 
d_rowmap("crsmatrix", nRow + 1); + auto h_rowmap = Kokkos::create_mirror_view(d_rowmap); + + Kokkos::View d_colidx("crsmatrix", nnz); + auto h_colidx = Kokkos::create_mirror_view(d_colidx); + + Kokkos::View d_matval("crsmatrix", nnz); + auto h_matval = Kokkos::create_mirror_view(d_matval); + + for (size_type ii = 0; ii < nnz; ++ii) set_random_value(h_matval[ii]); for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) { - const auto jbeg = mat_b1.graph.row_map(ir); - const auto jend = mat_b1.graph.row_map(ir + 1); + const size_type jbeg = mat_b1.graph.row_map(ir); + const size_type jend = mat_b1.graph.row_map(ir + 1); for (lno_t ib = 0; ib < blockSize; ++ib) { - const lno_t my_row = ir * blockSize + ib; - mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize; - for (lno_t ijk = jbeg; ijk < jend; ++ijk) { + const lno_t my_row = ir * blockSize + ib; + h_rowmap[my_row + 1] = h_rowmap[my_row] + (jend - jbeg) * blockSize; + for (size_type ijk = jbeg; ijk < jend; ++ijk) { const auto col0 = mat_b1.graph.entries(ijk); for (lno_t jb = 0; jb < blockSize; ++jb) { - mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = + h_colidx[h_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = col0 * blockSize + jb; } } } } // for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) + Kokkos::deep_copy(d_matval, h_matval); + Kokkos::deep_copy(d_colidx, h_colidx); + Kokkos::deep_copy(d_rowmap, h_rowmap); + // Create the CrsMatrix for the reference computation - crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], &mat_rowmap[0], - &mat_colidx[0]); + crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap, + d_colidx); x_vector_type xref("new_right_hand_side", nRow); auto h_xref = Kokkos::create_mirror_view(xref); @@ -179,7 +188,7 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // Compute the reference product KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); - y_vector_type ybcrs("bsr_product_result", nRow); + y_vector_type 
ybcrs("bcrs_product_result", nRow); auto h_ybcrs = Kokkos::create_mirror_view(ybcrs); for (lno_t ir = 0; ir < nRow; ++ir) h_ybcrs(ir) = h_y0(ir); Kokkos::deep_copy(ybcrs, h_ybcrs); @@ -187,26 +196,27 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // Create the BlockCrsMatrix KokkosSparse::Experimental::BlockCrsMatrix - Absr(Acrs, blockSize); + Abcrs(Acrs, blockSize); // Compute the product with the BlockCrsMatrix format - KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybcrs); + KokkosSparse::spmv(fOp, alpha, Abcrs, xref, beta, ybcrs); // Compare the two products - double error = 0.0, maxNorm = 0.0; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; + + const mag_type zero_mag = Kokkos::ArithTraits::zero(); + mag_type error = zero_mag, maxNorm = zero_mag; + Kokkos::deep_copy(h_ycrs, ycrs); Kokkos::deep_copy(h_ybcrs, ybcrs); for (lno_t ir = 0; ir < nRow; ++ir) { - error = std::max( - error, Kokkos::ArithTraits::abs(h_ycrs(ir) - h_ybcrs(ir))); - maxNorm = - std::max(maxNorm, Kokkos::ArithTraits::abs(h_ycrs(ir))); + error = std::max(error, KATS::abs(h_ycrs(ir) - h_ybcrs(ir))); + maxNorm = std::max(maxNorm, KATS::abs(h_ycrs(ir))); } - double tmps = - static_cast(Kokkos::ArithTraits::abs(alpha)) + - static_cast(Kokkos::ArithTraits::abs(beta)); - if ((tmps > 0.0) && (maxNorm == 0)) { + mag_type tmps = KATS::abs(alpha) + KATS::abs(beta); + if ((tmps > zero_mag) && (maxNorm == zero_mag)) { std::cout << " BlockCRSMatrix - SpMV times V >> blockSize " << blockSize << " maxNorm " << maxNorm << " error " << error << " alpha " << alpha << " beta " << beta << "\n"; @@ -216,9 +226,8 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // // --- Factor ((nnz / nRow) + 1) = Average number of non-zeros per row // - const auto tol = ((nnz / nRow) + 1) * - static_cast(Kokkos::ArithTraits::abs( - Kokkos::ArithTraits::epsilon())); + const mag_type tol = ((static_cast(nnz) / nRow) + 1) * + 
Kokkos::ArithTraits::epsilon(); if (error > tol * maxNorm) { std::cout << " BlockCRSMatrix - SpMV times V >> blockSize " << blockSize << " ratio " << error / maxNorm << " tol " << tol << " maxNorm " @@ -231,7 +240,7 @@ void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta, /// \brief Driver routine for checking BlockCrsMatrix times multiple vector template + typename layout, typename device> void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, const lno_t bMax, int &num_errors) { // The mat_structure view is used to generate a matrix using @@ -255,7 +264,7 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef Kokkos::View block_vector_t; + typedef Kokkos::View block_vector_t; h_crsMat_t mat_b1 = Test::generate_structured_matrix3D("FD", mat_structure); @@ -273,41 +282,40 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, size_type nnz = static_cast(blockSize) * static_cast(blockSize) * mat_b1.nnz(); - std::vector mat_val(nnz); - for (size_type ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]); + Kokkos::View d_rowmap("crsmatrix", nRow + 1); + auto h_rowmap = Kokkos::create_mirror_view(d_rowmap); - // - // Create graph for CrsMatrix - // + Kokkos::View d_colidx("crsmatrix", nnz); + auto h_colidx = Kokkos::create_mirror_view(d_colidx); - std::vector mat_rowmap(nRow + 1); - std::vector mat_colidx(nnz); + Kokkos::View d_matval("crsmatrix", nnz); + auto h_matval = Kokkos::create_mirror_view(d_matval); - mat_rowmap.resize(nRow + 1); - auto *rowmap = &mat_rowmap[0]; - rowmap[0] = 0; - - mat_colidx.resize(nnz); - auto *cols = &mat_colidx[0]; + for (size_type ii = 0; ii < nnz; ++ii) set_random_value(h_matval[ii]); for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) { - const auto jbeg = mat_b1.graph.row_map(ir); - const auto jend = mat_b1.graph.row_map(ir + 1); + const size_type jbeg = mat_b1.graph.row_map(ir); + 
const size_type jend = mat_b1.graph.row_map(ir + 1); for (lno_t ib = 0; ib < blockSize; ++ib) { - const lno_t my_row = ir * blockSize + ib; - rowmap[my_row + 1] = rowmap[my_row] + (jend - jbeg) * blockSize; - for (lno_t ijk = jbeg; ijk < jend; ++ijk) { + const lno_t my_row = ir * blockSize + ib; + h_rowmap[my_row + 1] = h_rowmap[my_row] + (jend - jbeg) * blockSize; + for (size_type ijk = jbeg; ijk < jend; ++ijk) { const auto col0 = mat_b1.graph.entries(ijk); for (lno_t jb = 0; jb < blockSize; ++jb) { - cols[rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = + h_colidx[h_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = col0 * blockSize + jb; } } } } // for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) + Kokkos::deep_copy(d_matval, h_matval); + Kokkos::deep_copy(d_colidx, h_colidx); + Kokkos::deep_copy(d_rowmap, h_rowmap); + // Create the CrsMatrix for the reference computation - crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], rowmap, cols); + crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap, + d_colidx); block_vector_t xref("new_right_hand_side", nRow, nrhs); auto h_xref = Kokkos::create_mirror_view(xref); @@ -329,7 +337,7 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); - block_vector_t ybcrs("bsr_product_result", nRow, nrhs); + block_vector_t ybcrs("bcrs_product_result", nRow, nrhs); auto h_ybcrs = Kokkos::create_mirror_view(ybcrs); for (int jc = 0; jc < nrhs; ++jc) for (lno_t ir = 0; ir < nRow; ++ir) h_ybcrs(ir, jc) = h_y0(ir, jc); @@ -338,38 +346,40 @@ void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, // Create the BlockCrsMatrix KokkosSparse::Experimental::BlockCrsMatrix - Absr(Acrs, blockSize); + Abcrs(Acrs, blockSize); // Compute the product for the BlockCrsMatrix format - KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybcrs); + KokkosSparse::spmv(fOp, alpha, Abcrs, xref, beta, ybcrs); Kokkos::deep_copy(h_ycrs, 
ycrs); Kokkos::deep_copy(h_ybcrs, ybcrs); // Compare the two products - double error = 0.0, maxNorm = 0.0; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; + + const mag_type zero_mag = Kokkos::ArithTraits::zero(); + mag_type error = zero_mag, maxNorm = zero_mag; + for (int jc = 0; jc < nrhs; ++jc) { for (int ir = 0; ir < nRow; ++ir) { - error = std::max(error, Kokkos::ArithTraits::abs( - h_ycrs(ir, jc) - h_ybcrs(ir, jc))); - maxNorm = std::max(maxNorm, - Kokkos::ArithTraits::abs(h_ycrs(ir, jc))); + error = std::max(error, + KATS::abs(h_ycrs(ir, jc) - h_ybcrs(ir, jc))); + maxNorm = std::max(maxNorm, KATS::abs(h_ycrs(ir, jc))); } } - auto tol = ((nnz / nRow) + 1) * - static_cast(Kokkos::ArithTraits::abs( - Kokkos::ArithTraits::epsilon())); - - double tmps = - static_cast(Kokkos::ArithTraits::abs(alpha)) + - static_cast(Kokkos::ArithTraits::abs(beta)); - if ((tmps > 0.0) && (maxNorm == 0)) { + + const mag_type tmps = KATS::abs(alpha) + KATS::abs(beta); + if ((tmps > zero_mag) && (maxNorm == zero_mag)) { std::cout << " BlockCRSMatrix - SpMV times MV >> blockSize " << blockSize << " maxNorm " << maxNorm << " error " << error << " alpha " << alpha << " beta " << beta << "\n"; num_errors += 1; } + const mag_type tol = ((static_cast(nnz) / nRow) + 1) * + Kokkos::ArithTraits::epsilon(); + if (error > tol * maxNorm) { std::cout << " BlockCRSMatrix - SpMV times MV >> blockSize " << blockSize << " ratio " << error / maxNorm << " tol " << tol << " maxNorm " @@ -425,7 +435,7 @@ void testSpMVBlockCrsMatrix() { } template + typename layout, typename device> void testBlockCrsMatrix_SpM_MV() { // // Test for the operation Y <- alpha * Op(A) * X + beta * Y @@ -452,7 +462,7 @@ void testBlockCrsMatrix_SpM_MV() { auto alpha_s = static_cast(testAlphaBeta[ii]); auto beta_s = static_cast(testAlphaBeta[ii + 1]); num_errors = 0; - Test_BlockCrs::check_blockcrs_times_mv(&mode, alpha_s, beta_s, bMax, num_errors); if (num_errors > 0) { @@ -469,282 +479,49 @@ void 
testBlockCrsMatrix_SpM_MV() { ////////////////////////// -#define EXECUTE_BCRS_TIMES_VEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##bcrs_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testSpMVBlockCrsMatrix(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - 
defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ 
- (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int64_t, size_t, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int64_t, size_t, - TestExecSpace) -#endif - -#undef EXECUTE_BCRS_TIMES_VEC_TEST +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST ////////////////////////// -#define EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - sparse##_##bcrs_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - testBlockCrsMatrix_SpM_MV(); \ +#define EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \ + TEST_F( \ + TestCategory, \ + 
sparse##_##bcrs_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ + testBlockCrsMatrix_SpM_MV(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if 
(defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, - TestExecSpace) -#endif - -#if 
(defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, - TestExecSpace) -#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft, \ + TestExecSpace) + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST + +#endif // KOKKOSKERNELS_INST_LAYOUTLEFT + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, \ + TestExecSpace) + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST + +#endif // KOKKOSKERNELS_INST_LAYOUTRIGHT #undef EXECUTE_BCRS_TIMES_MVEC_TEST diff --git 
a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp index e6d3b65ac5..344a203567 100644 --- a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp @@ -42,6 +42,7 @@ //@HEADER */ +#include #include #include #include @@ -96,33 +97,29 @@ inline void set_random_value(std::complex &v) { /// \param mat_rowmap[out] CRS-style row map for the block matrix /// \param mat_colidx[out] CRS-style column entries for the block matrix /// \param mat_val[out] Numerical (random) values -template +template void make_block_entries( const KokkosSparse::CrsMatrix &mat_b1, - int blockSize, std::vector &mat_rowmap, - std::vector &mat_colidx, std::vector &mat_val) { - lno_t nRow = blockSize * mat_b1.numRows(); + int blockSize, rowmap_type &mat_rowmap, colidx_type &mat_colidx, + values_type &mat_val) { size_t nnz = static_cast(blockSize) * static_cast(blockSize) * mat_b1.nnz(); - mat_val.resize(nnz); for (size_t ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]); // // Create graph for CrsMatrix // - mat_rowmap.assign(nRow + 1, 0); - mat_colidx.assign(nnz, 0); - for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) { - const auto jbeg = mat_b1.graph.row_map(ir); - const auto jend = mat_b1.graph.row_map(ir + 1); + const size_type jbeg = mat_b1.graph.row_map(ir); + const size_type jend = mat_b1.graph.row_map(ir + 1); for (lno_t ib = 0; ib < blockSize; ++ib) { const lno_t my_row = ir * blockSize + ib; mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize; - for (auto ijk = jbeg; ijk < jend; ++ijk) { + for (size_type ijk = jbeg; ijk < jend; ++ijk) { const auto col0 = mat_b1.graph.entries(ijk); for (lno_t jb = 0; jb < blockSize; ++jb) { mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = @@ -177,17 +174,26 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, size_type nnz = static_cast(blockSize) * static_cast(blockSize) * mat_b1.nnz(); - std::vector mat_rowmap(nRow + 1, 0); 
- std::vector mat_colidx(nnz, 0); - std::vector mat_val(nnz); + Kokkos::View d_rowmap("crsmatrix", nRow + 1); + auto h_rowmap = Kokkos::create_mirror_view(d_rowmap); + + Kokkos::View d_colidx("crsmatrix", nnz); + auto h_colidx = Kokkos::create_mirror_view(d_colidx); + + Kokkos::View d_matval("crsmatrix", nnz); + auto h_matval = Kokkos::create_mirror_view(d_matval); // Create the entries - make_block_entries(mat_b1, blockSize, mat_rowmap, - mat_colidx, mat_val); + make_block_entries(mat_b1, blockSize, h_rowmap, + h_colidx, h_matval); + + Kokkos::deep_copy(d_matval, h_matval); + Kokkos::deep_copy(d_colidx, h_colidx); + Kokkos::deep_copy(d_rowmap, h_rowmap); // Create the CrsMatrix for the reference computation - crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], &mat_rowmap[0], - &mat_colidx[0]); + crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap, + d_colidx); x_vector_type xref("new_right_hand_side", nRow); auto h_xref = Kokkos::create_mirror_view(xref); @@ -229,20 +235,21 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // // Compare the two products // - double error = 0.0, maxNorm = 0.0; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; + + const mag_type zero_mag = Kokkos::ArithTraits::zero(); + mag_type error = zero_mag, maxNorm = zero_mag; + Kokkos::deep_copy(h_ycrs, ycrs); Kokkos::deep_copy(h_ybsr, ybsr); for (lno_t ir = 0; ir < nRow; ++ir) { - error = std::max( - error, Kokkos::ArithTraits::abs(h_ycrs(ir) - h_ybsr(ir))); - maxNorm = - std::max(maxNorm, Kokkos::ArithTraits::abs(h_ycrs(ir))); + error = std::max(error, KATS::abs(h_ycrs(ir) - h_ybsr(ir))); + maxNorm = std::max(maxNorm, KATS::abs(h_ycrs(ir))); } - double tmps = - static_cast(Kokkos::ArithTraits::abs(alpha)) + - static_cast(Kokkos::ArithTraits::abs(beta)); - if ((tmps > 0.0) && (maxNorm == 0)) { + mag_type tmps = KATS::abs(alpha) + KATS::abs(beta); + if ((tmps > zero_mag) && (maxNorm == zero_mag)) { std::cout << " 
BSR - SpMV times MV >> blockSize " << blockSize << " maxNorm " << maxNorm << " error " << error << " alpha " << alpha << " beta " << beta << "\n"; @@ -252,9 +259,8 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, // // --- Factor ((nnz / nRow) + 1) = Average number of non-zeros per row // - const auto tol = ((nnz / nRow) + 1) * - static_cast(Kokkos::ArithTraits::abs( - Kokkos::ArithTraits::epsilon())); + const mag_type tol = ((static_cast(nnz) / nRow) + 1) * + Kokkos::ArithTraits::epsilon(); if (error > tol * maxNorm) { std::cout << " BSR - SpMV times V >> blockSize " << blockSize << " ratio " << error / maxNorm << " tol " << tol << " maxNorm " << maxNorm @@ -267,7 +273,7 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, /// \brief Driver routine for checking BsrMatrix times multiple vector template + typename layout, typename device> void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, const lno_t bMax, int &num_errors) { // The mat_structure view is used to generate a matrix using @@ -291,7 +297,7 @@ void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef Kokkos::View block_vector_t; + typedef Kokkos::View block_vector_t; h_crsMat_t mat_b1 = Test::generate_structured_matrix3D("FD", mat_structure); @@ -309,17 +315,26 @@ void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, size_type nnz = static_cast(blockSize) * static_cast(blockSize) * mat_b1.nnz(); - std::vector mat_rowmap(nRow + 1, 0); - std::vector mat_colidx(nnz, 0); - std::vector mat_val(nnz); + Kokkos::View d_rowmap("crsmatrix", nRow + 1); + auto h_rowmap = Kokkos::create_mirror_view(d_rowmap); + + Kokkos::View d_colidx("crsmatrix", nnz); + auto h_colidx = Kokkos::create_mirror_view(d_colidx); + + Kokkos::View d_matval("crsmatrix", nnz); + auto h_matval = Kokkos::create_mirror_view(d_matval); // Create the entries - 
make_block_entries(mat_b1, static_cast(blockSize), - mat_rowmap, mat_colidx, mat_val); + make_block_entries(mat_b1, blockSize, h_rowmap, + h_colidx, h_matval); + + Kokkos::deep_copy(d_matval, h_matval); + Kokkos::deep_copy(d_colidx, h_colidx); + Kokkos::deep_copy(d_rowmap, h_rowmap); // Create the CrsMatrix for the reference computation - crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], &mat_rowmap[0], - &mat_colidx[0]); + crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, d_matval, d_rowmap, + d_colidx); block_vector_t xref("new_right_hand_side", nRow, nrhs); auto h_xref = Kokkos::create_mirror_view(xref); @@ -366,29 +381,29 @@ void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, // // Compare the two products // - double error = 0.0, maxNorm = 0.0; + using KATS = Kokkos::ArithTraits; + using mag_type = typename KATS::mag_type; + + const mag_type zero_mag = Kokkos::ArithTraits::zero(); + mag_type error = zero_mag, maxNorm = zero_mag; for (int jc = 0; jc < nrhs; ++jc) { for (int ir = 0; ir < nRow; ++ir) { - error = std::max(error, Kokkos::ArithTraits::abs( - h_ycrs(ir, jc) - h_ybsr(ir, jc))); - maxNorm = std::max(maxNorm, - Kokkos::ArithTraits::abs(h_ycrs(ir, jc))); + error = std::max(error, + KATS::abs(h_ycrs(ir, jc) - h_ybsr(ir, jc))); + maxNorm = std::max(maxNorm, KATS::abs(h_ycrs(ir, jc))); } } - double tmps = - static_cast(Kokkos::ArithTraits::abs(alpha)) + - static_cast(Kokkos::ArithTraits::abs(beta)); - if ((tmps > 0.0) && (maxNorm == 0)) { + mag_type tmps = KATS::abs(alpha) + KATS::abs(beta); + if ((tmps > zero_mag) && (maxNorm == zero_mag)) { std::cout << " BSR - SpMV times MV >> blockSize " << blockSize << " maxNorm " << maxNorm << " error " << error << " alpha " << alpha << " beta " << beta << "\n"; num_errors += 1; } - auto tol = ((nnz / nRow) + 1) * - static_cast(Kokkos::ArithTraits::abs( - Kokkos::ArithTraits::epsilon())); + const mag_type tol = ((static_cast(nnz) / nRow) + 1) * + Kokkos::ArithTraits::epsilon(); if (error > 
tol * maxNorm) { std::cout << " BSR - SpMV times MV >> blockSize " << blockSize << " ratio " << error / maxNorm << " tol " << tol << " maxNorm " @@ -531,7 +546,7 @@ void testSpMVBsrMatrix() { } template + typename layout, typename device> void testBsrMatrix_SpM_MV() { // // Test for the operation Y <- alpha * Op(A) * X + beta * Y @@ -558,7 +573,7 @@ void testBsrMatrix_SpM_MV() { auto alpha_s = static_cast(testAlphaBeta[ii]); auto beta_s = static_cast(testAlphaBeta[ii + 1]); num_errors = 0; - Test_Bsr::check_bsrm_times_mv( + Test_Bsr::check_bsrm_times_mv( &mode, alpha_s, beta_s, bMax, num_errors); if (num_errors > 0) { printf( @@ -574,281 +589,48 @@ void testBsrMatrix_SpM_MV() { ////////////////////////// -#define EXECUTE_BSR_TIMES_VEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##bsrmat_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ testSpMVBsrMatrix(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - 
defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - 
!defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int64_t, size_t, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - 
(!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif - -#undef EXECUTE_BSR_TIMES_VEC_TEST +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST ////////////////////////// -#define EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - sparse##_##bsrmat_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - testBsrMatrix_SpM_MV(); \ +#define EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##bsrmat_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ + testBsrMatrix_SpM_MV(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if 
(defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ 
- defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, - TestExecSpace) -#endif +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) + +#define 
KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutLeft, \ + TestExecSpace) + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST + +#endif // KOKKOSKERNELS_INST_LAYOUTLEFT + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, LayoutRight, \ + TestExecSpace) + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST + +#endif // KOKKOSKERNELS_INST_LAYOUTRIGHT #undef EXECUTE_BSR_TIMES_MVEC_TEST diff --git a/unit_test/sparse/Test_Sparse_sptrsv.hpp b/unit_test/sparse/Test_Sparse_sptrsv.hpp index 1be27d0c9c..c470747202 100644 --- a/unit_test/sparse/Test_Sparse_sptrsv.hpp +++ b/unit_test/sparse/Test_Sparse_sptrsv.hpp @@ -45,12 +45,11 @@ #include #include -#include #include #include #include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_SparseUtils.hpp" +#include "KokkosSparse_Utils.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_CrsMatrix.hpp" @@ -122,7 +121,7 @@ void run_test_sptrsv_mtx() { bool is_lower_tri = true; std::cout << "Create handle" << std::endl; kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); - + std::cout << "Prepare linear system" << std::endl; // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs ValuesType known_lhs("known_lhs", nrows); @@ -239,7 +238,7 @@ void run_test_sptrsv_mtx() { bool is_lower_tri = false; std::cout << "Create handle" << std::endl; kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); - + std::cout << "Prepare linear system" << std::endl; // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs ValuesType known_lhs("known_lhs", nrows); @@ -1087,138 +1086,12 @@ void test_sptrsv() { // Test::run_test_sptrsv_mtx(); } -#define EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) 
\ TEST_F(TestCategory, \ sparse##_##sptrsv##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ test_sptrsv(); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || 
\ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(float, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_double, int64_t, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
-EXECUTE_TEST(kokkos_complex_float, int, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int, size_t, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) -#endif +#include -#undef EXECUTE_TEST +#undef KOKKOSKERNELS_EXECUTE_TEST diff --git a/unit_test/sparse/Test_Sparse_trsv.hpp b/unit_test/sparse/Test_Sparse_trsv.hpp index fce73897a8..9a23f48883 100644 --- a/unit_test/sparse/Test_Sparse_trsv.hpp +++ b/unit_test/sparse/Test_Sparse_trsv.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -76,8 +77,12 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, // this function creates a dense lower and upper triangular matrix. 
// TODO: SHOULD CHANGE IT TO SPARSE crsMat_t lower_part = - KokkosKernels::Impl::kk_generate_triangular_sparse_matrix( + KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( 'L', numRows, numCols, nnz, row_size_variance, bandwidth); + + Test::shuffleMatrixEntries(lower_part.graph.row_map, lower_part.graph.entries, + lower_part.values); + KokkosSparse::spmv("N", alpha, lower_part, b_x_copy, beta, b_y); Test::check_trsv_mv(lower_part, b_x, b_y, b_x_copy, numMV, "L", "N"); @@ -86,8 +91,12 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, // typedef typename Kokkos::View indexview; crsMat_t upper_part = - KokkosKernels::Impl::kk_generate_triangular_sparse_matrix( + KokkosSparse::Impl::kk_generate_triangular_sparse_matrix( 'U', numRows, numCols, nnz, row_size_variance, bandwidth); + + Test::shuffleMatrixEntries(upper_part.graph.row_map, upper_part.graph.entries, + upper_part.values); + KokkosSparse::spmv("N", alpha, upper_part, b_x_copy, beta, b_y); Test::check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "N"); @@ -95,309 +104,46 @@ void test_trsv_mv(lno_t numRows, size_type nnz, lno_t bandwidth, Test::check_trsv_mv(upper_part, b_x, b_y, b_x_copy, numMV, "U", "T"); } +// Note BMK 7-22: the matrix generator used by this test always +// generates a dense triangle. It ignores bandwidth, nnz and row size variance. 
+ #define EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LAYOUT, DEVICE) \ TEST_F( \ TestCategory, \ sparse##_##trsv_mv##_##SCALAR##_##ORDINAL##_##OFFSET##_##LAYOUT##_##DEVICE) { \ test_trsv_mv( \ - 5000, 5000 * 30, 200, 10, 1); \ + 1000, 1000 * 30, 200, 10, 1); \ test_trsv_mv( \ - 5000, 5000 * 30, 100, 10, 5); \ + 800, 800 * 30, 100, 10, 5); \ test_trsv_mv( \ - 1000, 1000 * 20, 100, 5, 10); \ + 400, 400 * 20, 100, 5, 10); \ } -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - 
defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, int, 
LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, size_t, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, int, LayoutLeft, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, size_t, LayoutLeft, TestExecSpace) -#endif - -#if 
(defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, size_t, LayoutLeft, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ +#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(double, int64_t, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - 
defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(float, int64_t, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) 
-EXECUTE_TEST_MV(kokkos_complex_double, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int, size_t, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_double, int64_t, size_t, LayoutRight, - TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, int, LayoutRight, TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int, size_t, LayoutRight, 
TestExecSpace) -#endif - -#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ - defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ - defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) && \ - defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutLeft, TestExecSpace) + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST + +#endif // KOKKOSKERNELS_INST_LAYOUTLEFT + +#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -EXECUTE_TEST_MV(kokkos_complex_float, int64_t, size_t, LayoutRight, - TestExecSpace) -#endif + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + EXECUTE_TEST_MV(SCALAR, ORDINAL, OFFSET, LayoutRight, TestExecSpace) + +#include + +#undef KOKKOSKERNELS_EXECUTE_TEST + +#endif // KOKKOSKERNELS_INST_LAYOUTRIGHT #undef EXECUTE_TEST_MV