diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml new file mode 100644 index 0000000000..834add9314 --- /dev/null +++ b/.github/workflows/format.yml @@ -0,0 +1,36 @@ +name: github-FORMAT + +on: + pull_request: + branches: + - master + - develop + +jobs: + clang-format-check: + runs-on: ubuntu-18.04 + steps: + - uses: actions/checkout@v2 + + - name: Install Dependencies + run: sudo apt install clang-format-8 + + - name: check + run: | + # Fetch from the default remote (origin) + git fetch &> /dev/null + + # For every file changed, apply clang-format + for file in $(git diff --name-only origin/$GITHUB_BASE_REF | egrep '.*\.cpp$|.*\.hpp$|.*\.h$'); do + if [ -e $file ]; then + clang-format-8 -i -style=file $file + git add $file + fi + done + + # If any diffs exist, error out + if [[ ! -z $(git status -s -uno . -- ':!.github') ]]; then + echo "The following files require formatting changes:" + git status -s -uno . -- ':!.github' + exit 1 + fi diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml new file mode 100644 index 0000000000..72152f749a --- /dev/null +++ b/.github/workflows/osx.yml @@ -0,0 +1,86 @@ +name: github-OSX + +on: + pull_request: + branches: + - master + - develop + +jobs: + osxci: + name: osx-ci + runs-on: [macos-latest] + + strategy: + matrix: + include: + - backend: "SERIAL" + cmake_build_type: "RelWithDebInfo" + - backend: "THREADS" + cmake_build_type: "RelWithDebInfo" + - backend: "SERIAL" + cmake_build_type: "Debug" + - backend: "SERIAL" + cmake_build_type: "Release" + + steps: + - name: checkout_kokkos_kernels + uses: actions/checkout@v2 + with: + path: kokkos-kernels + + - name: checkout_kokkos + uses: actions/checkout@v2 + with: + repository: kokkos/kokkos + ref: ${{ github.base_ref }} + path: kokkos + + - name: configure_kokkos + run: | + ls -lat + mkdir -p kokkos/{build,install} + cd kokkos/build + cmake \ + -DKokkos_ENABLE_${{ matrix.backend }}=ON \ + -DCMAKE_CXX_FLAGS="-Werror" \ + -DCMAKE_CXX_STANDARD=14 \ + -DKokkos_ENABLE_COMPILER_WARNINGS=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ + -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + .. + + - name: build_and_install_kokkos + working-directory: kokkos/build + run: make -j2 install + + - name: configure_kokkos_kernels + run: | + ls -lat + mkdir -p kokkos-kernels/{build,install} + cd kokkos-kernels/build + cmake \ + -DKokkos_DIR=$PWD/../../kokkos/install/lib/cmake/Kokkos \ + -DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }} \ + -DCMAKE_CXX_FLAGS="-Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wuninitialized" \ + -DCMAKE_INSTALL_PREFIX=$PWD/../install \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES:BOOL=ON \ + -DKokkosKernels_INST_COMPLEX_DOUBLE=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_COMPLEX_FLOAT=ON \ + -DKokkosKernels_INST_FLOAT=ON \ + -DKokkosKernels_INST_LAYOUTLEFT:BOOL=ON \ + -DKokkosKernels_INST_LAYOUTRIGHT:BOOL=ON \ + -DKokkosKernels_ENABLE_TPL_CUSPARSE=OFF \ + -DKokkosKernels_ENABLE_TPL_CUBLAS=OFF \ + .. 
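The OSX job above builds Kokkos and Kokkos Kernels as two separate CMake projects: Kokkos is configured, built, and installed first, and the Kokkos Kernels configure step then points `Kokkos_DIR` at the `lib/cmake/Kokkos` directory of that install. A minimal local sketch of the same two-stage configure is shown below; it assumes side-by-side `kokkos/` and `kokkos-kernels/` checkouts and a Serial-only build, so the paths, backend, and flags are placeholders to be adapted from the workflow above.

```bash
# Two-stage build mirroring the OSX CI job: install Kokkos, then build
# Kokkos Kernels against that install via Kokkos_DIR (paths are placeholders).
cmake -S kokkos -B kokkos/build \
  -DKokkos_ENABLE_SERIAL=ON \
  -DCMAKE_BUILD_TYPE=RelWithDebInfo \
  -DCMAKE_INSTALL_PREFIX=$PWD/kokkos/install
cmake --build kokkos/build -j 2 --target install

cmake -S kokkos-kernels -B kokkos-kernels/build \
  -DKokkos_DIR=$PWD/kokkos/install/lib/cmake/Kokkos \
  -DCMAKE_BUILD_TYPE=RelWithDebInfo \
  -DKokkosKernels_ENABLE_TESTS=ON
cmake --build kokkos-kernels/build -j 2
(cd kokkos-kernels/build && ctest -j2 --output-on-failure)
```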
+ + - name: build_kokkos_kernels + working-directory: kokkos-kernels/build + run: make -j2 + + - name: test + working-directory: kokkos-kernels/build + run: ctest -j2 --output-on-failure \ No newline at end of file diff --git a/.jenkins/nightly.groovy b/.jenkins/nightly.groovy index e98b34001c..f30d580edc 100644 --- a/.jenkins/nightly.groovy +++ b/.jenkins/nightly.groovy @@ -1,40 +1,86 @@ pipeline { agent none + options { + timeout(time: 3, unit: 'HOURS') + } + stages { - stage('HIP-ROCm-4.2-C++14') { - agent { - dockerfile { - filename 'Dockerfile.hip' - dir 'scripts/docker' - additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:4.2' - label 'rocm-docker && vega' - args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + stage('Build & Run') { + parallel { + stage('SYCL-OneAPI') { + agent { + dockerfile { + filename 'Dockerfile.sycl' + dir 'scripts/docker' + label 'nvidia-docker && volta' + args '-v /tmp/ccache.kokkos:/tmp/ccache' + } + } + steps { + sh '''rm -rf kokkos && + git clone -b develop https://github.com/kokkos/kokkos.git && cd kokkos && \ + mkdir build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DKokkos_ARCH_VOLTA70=ON \ + -DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \ + -DKokkos_ENABLE_SYCL=ON \ + -DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \ + -DCMAKE_CXX_STANDARD=17 \ + .. && \ + make -j8 && make install && \ + cd ../.. && rm -rf kokkos''' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_CXX_COMPILER=clang++ \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + .. && \ + make -j8''' + } + } + + stage('HIP-ROCm-4.5-C++14') { + agent { + dockerfile { + filename 'Dockerfile.hip' + dir 'scripts/docker' + additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:4.5' + label 'rocm-docker && vega' + args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES' + } + } + steps { + sh '''rm -rf kokkos && + git clone -b develop https://github.com/kokkos/kokkos.git && cd kokkos && \ + mkdir build && cd build && \ + cmake \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DKokkos_ENABLE_HIP=ON \ + .. && \ + make -j8 && make install && \ + cd ../.. && rm -rf kokkos''' + sh '''rm -rf build && mkdir -p build && cd build && \ + cmake \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_EXTENSIONS=OFF \ + -DKokkosKernels_ENABLE_TESTS=ON \ + -DKokkosKernels_ENABLE_EXAMPLES=ON \ + -DKokkosKernels_INST_DOUBLE=ON \ + -DKokkosKernels_INST_ORDINAL_INT=ON \ + -DKokkosKernels_INST_OFFSET_INT=ON \ + .. && \ + make -j8 && ctest --verbose''' + } } - } - steps { - sh '''rm -rf kokkos && - git clone -b develop https://github.com/kokkos/kokkos.git && cd kokkos && \ - mkdir build && cd build && \ - cmake \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DCMAKE_CXX_EXTENSIONS=OFF \ - -DKokkos_ENABLE_HIP=ON \ - .. && \ - make -j8 && make install && \ - cd ../.. 
&& rm -rf kokkos''' - sh '''rm -rf build && mkdir -p build && cd build && \ - cmake \ - -DCMAKE_BUILD_TYPE=RelWithDebInfo \ - -DCMAKE_CXX_COMPILER=hipcc \ - -DCMAKE_CXX_EXTENSIONS=OFF \ - -DKokkosKernels_ENABLE_TESTS=ON \ - -DKokkosKernels_ENABLE_EXAMPLES=ON \ - -DKokkosKernels_INST_DOUBLE=ON \ - -DKokkosKernels_INST_ORDINAL_INT=ON \ - -DKokkosKernels_INST_OFFSET_INT=ON \ - .. && \ - make -j8 && ctest --verbose''' } } } diff --git a/CHANGELOG.md b/CHANGELOG.md index 76de9db0d0..a961701013 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,129 @@ # Change Log +## [3.6.00](https://github.com/kokkos/kokkos-kernels/tree/3.6.00) (2022-02-18) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.5.00...3.6.00) + +### Features: + +#### Batched Sparse Linear algebra +- Kokkos Kernels is adding a new component to the library: batched sparse linear algebra. +- Similarly to the current dense batched algorithms, the new algorithms are called from +- the GPU and provide Team and TeamVector levels of parallelism; SpMV also provides a Serial +- call on the GPU. + +- Add Batched CG and Batched GMRES [\#1155](https://github.com/kokkos/kokkos-kernels/pull/1155) +- Add Jacobi Batched preconditioner [\#1219](https://github.com/kokkos/kokkos-kernels/pull/1219) + +#### Bsr and Tensor core algorithm for sparse linear algebra +- After introducing the BsrMatrix in release 3.5.0, new algorithms now support this format. +- For release 3.6.0 we are adding matrix-vector (matvec) multiplication and Gauss-Seidel, as well as an +- implementation of matvec that leverages tensor cores on Nvidia GPUs. More kernels are expected to +- support the Bsr format in future releases. + +- Add Spmv for BsrMatrix [\#1255](https://github.com/kokkos/kokkos-kernels/pull/1255) +- Add BLAS to SpMV operations for BsrMatrix [\#1297](https://github.com/kokkos/kokkos-kernels/pull/1297) +- BSR format support in block Gauss-Seidel [\#1232](https://github.com/kokkos/kokkos-kernels/pull/1232) +- Experimental tensor-core SpMV for BsrMatrix [\#1090](https://github.com/kokkos/kokkos-kernels/pull/1090) + +#### Improved AMD math libraries support +- rocBLAS and rocSPARSE TPLs are now officially supported; they can be enabled at configure time. +- Initial kernels that can call rocBLAS are GEMV, GEMM, IAMAX and SCAL, while rocSPARSE can be +- called for matrix-vector multiplication. Further support for TPL calls can be requested on Slack +- and via GitHub issues. 
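A minimal configure sketch for enabling these TPLs is given below. It assumes a HIP-enabled Kokkos install and ROCm under `/opt/rocm` (which is what lets the new `FindTPLROCBLAS.cmake`/`FindTPLROCSPARSE.cmake` modules locate the libraries via `ROCM_PATH`); the compiler and paths are placeholders. The `KokkosKernels_ENABLE_TPL_ROCBLAS`/`KokkosKernels_ENABLE_TPL_ROCSPARSE` switches follow the same naming pattern as the existing CUBLAS/CUSPARSE options.

```bash
# Sketch: configure Kokkos Kernels with the new rocBLAS/rocSPARSE TPLs enabled.
# Assumes a HIP-enabled Kokkos install and ROCm in /opt/rocm; paths are placeholders.
export ROCM_PATH=/opt/rocm
cmake \
  -DCMAKE_CXX_COMPILER=hipcc \
  -DKokkos_DIR=/path/to/kokkos-install/lib/cmake/Kokkos \
  -DKokkosKernels_ENABLE_TPL_ROCBLAS=ON \
  -DKokkosKernels_ENABLE_TPL_ROCSPARSE=ON \
  /path/to/kokkos-kernels
```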
+ +- Tpl rocBLAS and rocSPARSE [\#1153](https://github.com/kokkos/kokkos-kernels/pull/1153) +- Add rocBLAS GEMV wrapper [\#1201](https://github.com/kokkos/kokkos-kernels/pull/1201) +- Add rocBLAS wrappers for GEMM, IAMAX, and SCAL [\#1230](https://github.com/kokkos/kokkos-kernels/pull/1230) +- SpMV: adding support for rocSPARSE TPL [\#1221](https://github.com/kokkos/kokkos-kernels/pull/1221) + +#### Additional new features +- bhalf: Unit test Batched GEMM [\#1251](https://github.com/kokkos/kokkos-kernels/pull/1251) +- and demostrate GMRES example convergence with bhalf_t (https://github.com/kokkos/kokkos-kernels/pull/1300) +- Stream interface: adding stream support in GEMV and GEMM [\#1131](https://github.com/kokkos/kokkos-kernels/pull/1131) +- Improve double buffering batched gemm performance [\#1217](https://github.com/kokkos/kokkos-kernels/pull/1217) +- Allow choosing coloring algorithm in multicolor GS [\#1199](https://github.com/kokkos/kokkos-kernels/pull/1199) +- Batched: Add armpl dgemm support [\#1256](https://github.com/kokkos/kokkos-kernels/pull/1256) + +### Deprecations: +- Deprecation warning: SpaceAccessibility move out of impl, see #1140 [\#1141](https://github.com/kokkos/kokkos-kernels/pull/1141) + +### Backends and Archs Enhancements: + +#### SYCL: +- Full Blas support on SYCL [\#1270](https://github.com/kokkos/kokkos-kernels/pull/1270) +- Get sparse tests enabled and working for SYCL [\#1269](https://github.com/kokkos/kokkos-kernels/pull/1269) +- Changes to make graph run on SYCL [\#1268](https://github.com/kokkos/kokkos-kernels/pull/1268) +- Allow querying free/total memory for SYCL [\#1225](https://github.com/kokkos/kokkos-kernels/pull/1225) +- Use KOKKOS_IMPL_DO_NOT_USE_PRINTF instead of printf in kernels [\#1162](https://github.com/kokkos/kokkos-kernels/pull/1162) + +#### HIP: +- Work around hipcc size_t/int division with remainder bug [\#1262](https://github.com/kokkos/kokkos-kernels/pull/1262) + +#### Other Improvements: +- Replace std::abs with ArithTraits::abs [\#1312](https://github.com/kokkos/kokkos-kernels/pull/1312) +- Batched/dense: Add Gemm_DblBuf LayoutLeft operator [\#1299](https://github.com/kokkos/kokkos-kernels/pull/1299) +- KokkosKernels: adding variable that returns version as a single number [\#1295](https://github.com/kokkos/kokkos-kernels/pull/1295) +- Add KOKKOSKERNELS_FORCE_SIMD macro (Fix #1040) [\#1290](https://github.com/kokkos/kokkos-kernels/pull/1290) +- Rename KOKKOS_IF_{HOST,DEVICE} -> KOKKOS_IF_ON_{HOST,DEVICE} [\#1278](https://github.com/kokkos/kokkos-kernels/pull/1278) +- Algo::Level{2,3}::Blocked::mb() [\#1265](https://github.com/kokkos/kokkos-kernels/pull/1265) +- Batched: Use SerialOpt2 for 33 to 39 square matrices [\#1261](https://github.com/kokkos/kokkos-kernels/pull/1261) +- Prune extra dependencies [\#1241](https://github.com/kokkos/kokkos-kernels/pull/1241) +- Improve double buffering batched gemm perf for matrix sizes >64x64 [\#1239](https://github.com/kokkos/kokkos-kernels/pull/1239) +- Improve graph color perf test [\#1229](https://github.com/kokkos/kokkos-kernels/pull/1229) +- Add custom implementation for strcasecmp [\#1227](https://github.com/kokkos/kokkos-kernels/pull/1227) +- Replace __restrict__ with KOKKOS_RESTRICT [\#1223](https://github.com/kokkos/kokkos-kernels/pull/1223) +- Replace array reductions in BLAS-1 MV reductions [\#1204](https://github.com/kokkos/kokkos-kernels/pull/1204) +- Update MIS-2 and aggregation [\#1143](https://github.com/kokkos/kokkos-kernels/pull/1143) +- perf_test/blas/blas3: Update SHAs for 
benchmarking [\#1139](https://github.com/kokkos/kokkos-kernels/pull/1139) + +### Implemented enhancements BuildSystem +- Bump ROCm version 4.2 -> 4.5 in nightly Jenkins CI build [\#1279](https://github.com/kokkos/kokkos-kernels/pull/1279) +- scripts/cm_test_all_sandia: Add A64FX ci checks [\#1276](https://github.com/kokkos/kokkos-kernels/pull/1276) +- github/workflows: Add osx CI [\#1254](https://github.com/kokkos/kokkos-kernels/pull/1254) +- Update SYCL compiler version in CI [\#1247](https://github.com/kokkos/kokkos-kernels/pull/1247) +- Do not set Kokkos variables when exporting CMake configuration [\#1236](https://github.com/kokkos/kokkos-kernels/pull/1236) +- Add nightly CI check for SYCL [\#1190](https://github.com/kokkos/kokkos-kernels/pull/1190) +- Update cmake minimum version to 3.16 [\#866](https://github.com/kokkos/kokkos-kernels/pull/866) + +### Incompatibilities: +- Kokkos::Impl: removing a few more instances of throw_runtime_exception [\#1320](https://github.com/kokkos/kokkos-kernels/pull/1320) +- Remove Kokkos::Impl::throw_runtime_exception from Kokkos Kernels [\#1294](https://github.com/kokkos/kokkos-kernels/pull/1294) +- Remove unused memory space utility [\#1283](https://github.com/kokkos/kokkos-kernels/pull/1283) +- Clean up Kokkos header includes [\#1282](https://github.com/kokkos/kokkos-kernels/pull/1282) +- Remove private Kokkos header include (Cuda/Kokkos_Cuda_Half.hpp) [\#1281](https://github.com/kokkos/kokkos-kernels/pull/1281) +- Avoid using #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_* macro guards [\#1266](https://github.com/kokkos/kokkos-kernels/pull/1266) +- Rename enumerator Impl::Exec_{PTHREADS -> THREADS} [\#1253](https://github.com/kokkos/kokkos-kernels/pull/1253) +- Remove all references to the Kokkos QThreads backend [\#1238](https://github.com/kokkos/kokkos-kernels/pull/1238) +- Replace more occurences of Kokkos::Impl::is_view [\#1234](https://github.com/kokkos/kokkos-kernels/pull/1234) +- Do not use Kokkos::Impl::is_view [\#1214](https://github.com/kokkos/kokkos-kernels/pull/1214) +- Replace Kokkos::Impl::if_c -> std::conditional [\#1213](https://github.com/kokkos/kokkos-kernels/pull/1213) + +### Bug Fixes: +- Fix bug in spmv_mv_bsrmatrix() for Ampere GPU arch [\#1315](https://github.com/kokkos/kokkos-kernels/pull/1315) +- Fix std::abs calls for rocBLAS/rocSparse [\#1310](https://github.com/kokkos/kokkos-kernels/pull/1310) +- cast literal 0 to fragment scalar type [\#1307](https://github.com/kokkos/kokkos-kernels/pull/1307) +- Fix 1303: maintain correct #cols on A in twostage [\#1304](https://github.com/kokkos/kokkos-kernels/pull/1304) +- Add dimension checking to generic spmv interface [\#1301](https://github.com/kokkos/kokkos-kernels/pull/1301) +- Add missing barriers to TeamGMRES, fix vector len [\#1285](https://github.com/kokkos/kokkos-kernels/pull/1285) +- Examples: fixing some issues related to type checking [\#1267](https://github.com/kokkos/kokkos-kernels/pull/1267) +- Restrict BsrMatrix specialization for AMPERE and VOLTA to CUDA [\#1242](https://github.com/kokkos/kokkos-kernels/pull/1242) +- Fix compilation errors for multi-vectors in kk_print_1Dview() [\#1231](https://github.com/kokkos/kokkos-kernels/pull/1231) +- src/batched: Fixes #1224 [\#1226](https://github.com/kokkos/kokkos-kernels/pull/1226) +- Fix SpGEMM crashing on empty rows [\#1220](https://github.com/kokkos/kokkos-kernels/pull/1220) +- Fix issue #1212 [\#1218](https://github.com/kokkos/kokkos-kernels/pull/1218) +- example/gmres: Specify half_t namespace 
[\#1208](https://github.com/kokkos/kokkos-kernels/pull/1208) +- Check that ordinal types are signed [\#1188](https://github.com/kokkos/kokkos-kernels/pull/1188) +- Fixing a couple of small issue with tensor core spmv [\#1185](https://github.com/kokkos/kokkos-kernels/pull/1185) +- Fix #threads setting in pcg for OpenMP [\#1182](https://github.com/kokkos/kokkos-kernels/pull/1182) +- SpMV: fix catch all case to avoid compiler warnings [\#1179](https://github.com/kokkos/kokkos-kernels/pull/1179) +- using namespace should be scoped to prevent name clashes [\#1177](https://github.com/kokkos/kokkos-kernels/pull/1177) +- using namespace should be scoped to prevent name clashes, see issue #1170 [\#1171](https://github.com/kokkos/kokkos-kernels/pull/1171) +- Fix bug with mkl impl of spgemm [\#1167](https://github.com/kokkos/kokkos-kernels/pull/1167) +- Add missing $ to KOKKOS_HAS_TRILINOS in sparse_sptrsv_superlu check [\#1160](https://github.com/kokkos/kokkos-kernels/pull/1160) +- Small fixes to spgemm, and plug gaps in testing [\#1159](https://github.com/kokkos/kokkos-kernels/pull/1159) +- SpMV: mismatch in #ifdef check and kernel specialization [\#1151](https://github.com/kokkos/kokkos-kernels/pull/1151) +- Fix values dimension for block sparse matrices [\#1147](https://github.com/kokkos/kokkos-kernels/pull/1147) + ## [3.5.00](https://github.com/kokkos/kokkos-kernels/tree/3.5.00) (2021-10-19) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/3.4.01...3.5.00) diff --git a/CMakeLists.txt b/CMakeLists.txt index 95f4cd0ee9..c4c8a3ccfa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,7 +10,7 @@ SET(KOKKOSKERNELS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) IF(NOT KOKKOSKERNELS_HAS_TRILINOS) - cmake_minimum_required(VERSION 3.10 FATAL_ERROR) + cmake_minimum_required(VERSION 3.16 FATAL_ERROR) IF (Spack_WORKAROUND) #if we are explicitly using Spack for development, #nuke the Spack compiler @@ -24,14 +24,14 @@ IF(NOT KOKKOSKERNELS_HAS_TRILINOS) PROJECT(KokkosKernels CXX) ENDIF() SET(KokkosKernels_VERSION_MAJOR 3) - SET(KokkosKernels_VERSION_MINOR 5) + SET(KokkosKernels_VERSION_MINOR 6) SET(KokkosKernels_VERSION_PATCH 00) + SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") + MATH(EXPR KOKKOSKERNELS_VERSION "${KokkosKernels_VERSION_MAJOR} * 10000 + ${KokkosKernels_VERSION_MINOR} * 100 + ${KokkosKernels_VERSION_PATCH}") ENDIF() -IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") - MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") - CMAKE_POLICY(SET CMP0074 NEW) -ENDIF() +MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") +CMAKE_POLICY(SET CMP0074 NEW) INCLUDE(GNUInstallDirs) IF (KOKKOSKERNELS_HAS_TRILINOS) @@ -47,6 +47,8 @@ ENDIF() INCLUDE(cmake/fake_tribits.cmake) INCLUDE(cmake/kokkoskernels_tribits.cmake) +OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF) + KOKKOSKERNELS_PACKAGE() IF (NOT KOKKOSKERNELS_HAS_TRILINOS) diff --git a/cm_generate_makefile.bash b/cm_generate_makefile.bash index d633a139c8..b26ba7be97 100755 --- a/cm_generate_makefile.bash +++ b/cm_generate_makefile.bash @@ -291,6 +291,7 @@ display_help_text() { echo "--ldflags=[FLAGS] Overwrite LDFLAGS for library build and test" echo " build. This will still set certain required" echo " flags (such as -fopenmp, -lpthread, etc.)." 
+ echo "--shared: Build Kokkos and KokkosKernels as shared libraries (required for SYCL on Intel)" echo "--with-gtest=/Path/To/Gtest: Set path to gtest. (Used in unit and performance" echo " tests.)" echo "--with-hwloc=/Path/To/Hwloc: Set path to hwloc library." @@ -348,6 +349,9 @@ KOKKOSKERNELS_DO_TESTS=ON KOKKOSKERNELS_DO_PERFSUITE=OFF KOKKOSKERNELS_DO_EXAMPLES=ON +#Build static libraries by default +BUILD_SHARED_LIBRARIES=OFF + KOKKOS_MAKEINSTALL_J=4 KERNELS_DEFAULT_ETI_OPTION="" @@ -467,6 +471,9 @@ do --debug|-dbg) KOKKOSKERNELS_DEBUG=ON ;; + --shared) + BUILD_SHARED_LIBRARIES=ON + ;; --no-default-eti) KERNELS_DEFAULT_ETI_OPTION="-DKokkosKernels_ADD_DEFAULT_ETI=OFF" ;; @@ -731,9 +738,9 @@ cd ${KOKKOS_INSTALL_PATH} # Configure kokkos echo "" -echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH} +echo cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH} echo "" -cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} ${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH} +cmake $COMPILER_CMD -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_EXE_LINKER_FLAGS="${KOKKOS_LDFLAGS//\"}" -DCMAKE_INSTALL_PREFIX=${KOKKOS_INSTALL_PATH} ${KOKKOS_DEVICE_CMD} ${KOKKOS_ARCH_CMD} -DKokkos_ENABLE_TESTS=${KOKKOS_DO_TESTS} -DKokkos_ENABLE_EXAMPLES=${KOKKOS_DO_EXAMPLES} ${KOKKOS_OPTION_CMD} ${KOKKOS_CUDA_OPTION_CMD} ${KOKKOS_HIP_OPTION_CMD} -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_CXX_EXTENSIONS=OFF ${STANDARD_CMD} ${KOKKOS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES}${KOKKOS_BC_CMD} ${KOKKOS_HWLOC_CMD} ${KOKKOS_HWLOC_PATH_CMD} ${KOKKOS_MEMKIND_CMD} ${KOKKOS_MEMKIND_PATH_CMD} -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF ${KOKKOS_PATH} # Install kokkos library make install -j $KOKKOS_MAKEINSTALL_J @@ -758,7 +765,7 @@ cd $STORE_KOKKOSKERNELS_BUILD_PATH # Configure kokkos-kernels echo "" -echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" 
-DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOSKERNELS_BUILDTYPE_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} +echo cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} echo "" -cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOSKERNELS_BUILDTYPE_CMD} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} +cmake $COMPILER_CMD -DKokkos_DIR="${KOKKOS_FIND_PATH}" -DCMAKE_CXX_FLAGS="${KOKKOS_CXXFLAGS//\"}" -DCMAKE_INSTALL_PREFIX="${PREFIX}" -DKokkosKernels_ENABLE_TESTS_AND_PERFSUITE=${KOKKOSKERNELS_DO_PERFSUITE} -DKokkosKernels_ENABLE_TESTS=${KOKKOSKERNELS_DO_TESTS} -DKokkosKernels_ENABLE_EXAMPLES:BOOL=${KOKKOSKERNELS_DO_EXAMPLES} ${KOKKOSKERNELS_SCALARS_CMD} ${KOKKOSKERNELS_ORDINALS_CMD} ${KOKKOSKERNELS_OFFSETS_CMD} ${KOKKOSKERNELS_LAYOUTS_CMD} ${KOKKOSKERNELS_TPLS_CMD} ${KOKKOSKERNELS_USER_TPL_PATH_CMD} ${KOKKOSKERNELS_USER_TPL_LIBNAME_CMD} ${KOKKOSKERNELS_EXTRA_LINKER_FLAGS_CMD} ${KOKKOSKERNELS_BUILDTYPE_CMD} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBRARIES} ${KOKKOSKERNELS_SPACES_CMD} ${KERNELS_DEFAULT_ETI_OPTION} ${KOKKOSKERNELS_PATH} diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index a36e745c71..2dcedcc1c9 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,6 +1,6 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms - LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL + LIB_OPTIONAL_TPLS quadmath MKL BLAS LAPACK CUSPARSE MAGMA METIS SuperLU Cholmod LAPACKE CBLAS ARMPL ROCBLAS ROCSPARSE TEST_OPTIONAL_TPLS yaml-cpp ) # NOTE: If you update names in LIB_OPTIONAL_TPLS above, make sure to map those names in diff --git a/cmake/KokkosKernelsConfig.cmake.in b/cmake/KokkosKernelsConfig.cmake.in index 
f930dd51a0..fbceffe76c 100644 --- a/cmake/KokkosKernelsConfig.cmake.in +++ b/cmake/KokkosKernelsConfig.cmake.in @@ -9,13 +9,5 @@ include(CMakeFindDependencyMacro) find_dependency(Kokkos HINTS @Kokkos_DIR@) -SET(Kokkos_ENABLE_OPENMP @Kokkos_ENABLE_OPENMP@) -SET(Kokkos_ENABLE_OPENMPTARGET @Kokkos_ENABLE_OPENMPTARGET@) -SET(Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) -SET(Kokkos_ENABLE_HIP @Kokkos_ENABLE_HIP@) -SET(Kokkos_ENABLE_SYCL @Kokkos_ENABLE_SYCL@) -SET(Kokkos_ENABLE_PTHREAD @Kokkos_ENABLE_PTHREAD@) -SET(Kokkos_ENABLE_SERIAL @Kokkos_ENABLE_SERIAL@) - INCLUDE("${KokkosKernels_CMAKE_DIR}/KokkosKernelsTargets.cmake") diff --git a/cmake/KokkosKernels_config.h.in b/cmake/KokkosKernels_config.h.in index adfd3cd118..f8dd2ae133 100644 --- a/cmake/KokkosKernels_config.h.in +++ b/cmake/KokkosKernels_config.h.in @@ -1,11 +1,15 @@ #ifndef KOKKOSKERNELS_CONFIG_H #define KOKKOSKERNELS_CONFIG_H - /* Define Fortran mangle from Trilinos macro definition */ -#ifndef F77_BLAS_MANGLE -# define F77_BLAS_MANGLE@F77_BLAS_MANGLE@ -#endif +// clang-format off +#ifndef F77_BLAS_MANGLE +#define F77_BLAS_MANGLE@F77_BLAS_MANGLE@ +#endif +// clang-format on + +/* Define the current version of Kokkos Kernels */ +#cmakedefine KOKKOSKERNELS_VERSION @KOKKOSKERNELS_VERSION@ /* Define if fortran blas 1 function can return complex type */ #cmakedefine KOKKOSKERNELS_TPL_BLAS_RETURN_COMPLEX @@ -22,7 +26,6 @@ than just BLAS and LAPACK functions. */ #cmakedefine HAVE_KOKKOSKERNELS_MKL - #cmakedefine KOKKOSKERNELS_ENABLE_TESTS_AND_PERFSUITE /* Define this macro if experimental features of Kokkoskernels are enabled */ @@ -61,11 +64,12 @@ /* Whether to build kernels for memory space Kokkos::HostSpace */ #cmakedefine KOKKOSKERNELS_INST_MEMSPACE_HOSTSPACE - /* Whether to build kernels for scalar type double */ #cmakedefine KOKKOSKERNELS_INST_DOUBLE /* Whether to build kernels for scalar type float */ #cmakedefine KOKKOSKERNELS_INST_FLOAT +/* Whether to build kernels for scalar type Kokkos::Experimental::half_t */ +#cmakedefine KOKKOSKERNELS_INST_HALF /* Whether to build kernels for scalar type complex */ #cmakedefine KOKKOSKERNELS_INST_COMPLEX_DOUBLE /* Whether to build kernels for scalar type complex */ @@ -119,25 +123,27 @@ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_METIS /* ARMPL */ #cmakedefine KOKKOSKERNELS_ENABLE_TPL_ARMPL +#cmakedefine ARMPL_BUILD @ARMPL_BUILD@ +/* ROCBLAS */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +/* ROCSPARSE */ +#cmakedefine KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE #cmakedefine KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV /* if MKL or ARMPL, BLAS is also defined */ -#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) ||\ +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) || \ defined(KOKKOSKERNELS_ENABLE_TPL_ARMPL) #if !defined(KOKKOSKERNELS_ENABLE_TPL_BLAS) #define KOKKOSKERNELS_ENABLE_TPL_BLAS #endif #endif -#if !defined(KOKKOS_ENABLE_CUDA) \ - && !defined(KOKKOS_ENABLE_HIP) \ - && !defined(KOKKOS_ENABLE_SYCL) \ - && !defined(KOKKOS_ENABLE_OPENMPTARGET) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ + !defined(KOKKOS_ENABLE_SYCL) && !defined(KOKKOS_ENABLE_OPENMPTARGET) #define KOKKOSKERNELS_ENABLE_HOST_ONLY #endif - /* * "Optimization level" for computational kernels in this subpackage. * The higher the level, the more code variants get generated, and @@ -145,11 +151,10 @@ * mean both better performance overall, and more uniform performance * for corner cases. 
*/ -#define KOKKOSLINALG_OPT_LEVEL @KokkosLinAlg_Opt_Level@ +#define KOKKOSLINALG_OPT_LEVEL @KokkosLinAlg_Opt_Level @ #ifndef KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY false #endif - -#endif // KOKKOSKERNELS_CONFIG_H +#endif // KOKKOSKERNELS_CONFIG_H diff --git a/cmake/Modules/FindTPLARMPL.cmake b/cmake/Modules/FindTPLARMPL.cmake index 62e1e33ea3..6f56b0a884 100644 --- a/cmake/Modules/FindTPLARMPL.cmake +++ b/cmake/Modules/FindTPLARMPL.cmake @@ -14,6 +14,7 @@ ELSEIF (ARMPL_LIBRARIES) ELSEIF (ARMPL_LIBRARY_DIRS) KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES amath ${ARMPL_LIB} LIBRARY_PATHS ${ARMPL_LIBRARY_DIRS}) ELSEIF (DEFINED ENV{ARMPL_DIR}) + SET(ARMPL_BUILD $ENV{ARMPL_BUILD}) SET(ARMPL_ROOT $ENV{ARMPL_DIR}) KOKKOSKERNELS_FIND_IMPORTED(ARMPL INTERFACE LIBRARIES diff --git a/cmake/Modules/FindTPLROCBLAS.cmake b/cmake/Modules/FindTPLROCBLAS.cmake new file mode 100644 index 0000000000..0217e8cf2c --- /dev/null +++ b/cmake/Modules/FindTPLROCBLAS.cmake @@ -0,0 +1,37 @@ +IF (ROCBLAS_LIBRARY_DIRS AND ROCBLAS_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES} LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) +ELSEIF (ROCBLAS_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES ${ROCBLAS_LIBRARIES}) +ELSEIF (ROCBLAS_LIBRARY_DIRS) + KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE LIBRARIES rocblas LIBRARY_PATHS ${ROCBLAS_LIBRARY_DIRS}) +ELSEIF (KokkosKernels_ROCBLAS_ROOT) + KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE + LIBRARIES + rocblas + LIBRARY_PATHS + ${KokkosKernels_ROCBLAS_ROOT}/lib + HEADERS + rocblas.h + HEADER_PATHS + ${KokkosKernels_ROCBLAS_ROOT}/include + ) +ELSEIF (DEFINED ENV{ROCM_PATH}) + MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") + SET(ROCBLAS_ROOT "$ENV{ROCM_PATH}/rocblas") + KOKKOSKERNELS_FIND_IMPORTED(ROCBLAS INTERFACE + LIBRARIES + rocblas + LIBRARY_PATHS + ${ROCBLAS_ROOT}/lib + HEADERS + rocblas.h + HEADER_PATHS + ${ROCBLAS_ROOT}/include + ) +ELSE() + MESSAGE(ERROR "rocBLAS was not detected properly, please disable it or provide sufficient information at configure time.") + # Todo: figure out how to use the target defined during rocblas installation + # FIND_PACKAGE(ROCBLAS REQUIRED) + # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCBLAS INTERFACE LINK_LIBRARIES ${ROCBLAS_LIBRARIES}) + # GET_TARGET_PROPERTY(ROCBLAS_LINK_LIBRARIES ${ROCBLAS_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) +ENDIF() \ No newline at end of file diff --git a/cmake/Modules/FindTPLROCSPARSE.cmake b/cmake/Modules/FindTPLROCSPARSE.cmake new file mode 100644 index 0000000000..52a0261b48 --- /dev/null +++ b/cmake/Modules/FindTPLROCSPARSE.cmake @@ -0,0 +1,37 @@ +IF (ROCSPARSE_LIBRARY_DIRS AND ROCSPARSE_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES} LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) +ELSEIF (ROCSPARSE_LIBRARIES) + KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES ${ROCSPARSE_LIBRARIES}) +ELSEIF (ROCSPARSE_LIBRARY_DIRS) + KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE LIBRARIES rocsparse LIBRARY_PATHS ${ROCSPARSE_LIBRARY_DIRS}) +ELSEIF (KokkosKernels_ROCSPARSE_ROOT) + KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE + LIBRARIES + rocsparse + LIBRARY_PATHS + ${KokkosKernels_ROCSPARSE_ROOT}/lib + HEADERS + rocsparse.h + HEADER_PATHS + ${KokkosKernels_ROCSPARSE_ROOT}/include + ) +ELSEIF (DEFINED ENV{ROCM_PATH}) + MESSAGE(STATUS "Detected ROCM_PATH: ENV{ROCM_PATH}") + SET(ROCSPARSE_ROOT "$ENV{ROCM_PATH}/rocsparse") + KOKKOSKERNELS_FIND_IMPORTED(ROCSPARSE INTERFACE + 
LIBRARIES + rocsparse + LIBRARY_PATHS + ${ROCSPARSE_ROOT}/lib + HEADERS + rocsparse.h + HEADER_PATHS + ${ROCSPARSE_ROOT}/include + ) +ELSE() + MESSAGE(ERROR "rocSPARSE was not detected properly, please disable it or provide sufficient information at configure time.") + # Todo: figure out how to use the target defined during rocsparse installation + # FIND_PACKAGE(ROCSPARSE REQUIRED) + # KOKKOSKERNELS_CREATE_IMPORTED_TPL(ROCSPARSE INTERFACE LINK_LIBRARIES ${ROCSPARSE_LIBRARIES}) + # GET_TARGET_PROPERTY(ROCSPARSE_LINK_LIBRARIES ${ROCSPARSE_LIBRARIES} IMPORTED_LINK_INTERFACE_LIBRARIES) +ENDIF() \ No newline at end of file diff --git a/cmake/kokkos_backends.cmake b/cmake/kokkos_backends.cmake index 11c51eed53..9346475f91 100644 --- a/cmake/kokkos_backends.cmake +++ b/cmake/kokkos_backends.cmake @@ -10,9 +10,15 @@ MACRO(CHECK_KOKKOS_BACKEND BE) ENDMACRO(CHECK_KOKKOS_BACKEND) CHECK_KOKKOS_BACKEND(SERIAL) -CHECK_KOKKOS_BACKEND(PTHREAD) +CHECK_KOKKOS_BACKEND(THREADS) CHECK_KOKKOS_BACKEND(OPENMP) CHECK_KOKKOS_BACKEND(OPENMPTARGET) CHECK_KOKKOS_BACKEND(CUDA) CHECK_KOKKOS_BACKEND(HIP) CHECK_KOKKOS_BACKEND(SYCL) + +# for backward compatibility. can be dropped when requiring Kokkos 3.6 +IF (Kokkos_ENABLE_PTHREAD) + SET(KOKKOS_ENABLE_THREADS ON) + SET(KOKKOSKERNELS_INST_EXECSPACE_THREADS_DEFAULT ON) +ENDIF() diff --git a/cmake/kokkoskernels_eti_devices.cmake b/cmake/kokkoskernels_eti_devices.cmake index 81ab89508e..47dce1f9d1 100644 --- a/cmake/kokkoskernels_eti_devices.cmake +++ b/cmake/kokkoskernels_eti_devices.cmake @@ -8,7 +8,7 @@ SET(EXEC_SPACES EXECSPACE_SYCL EXECSPACE_OPENMPTARGET EXECSPACE_OPENMP - EXECSPACE_PTHREAD + EXECSPACE_THREADS EXECSPACE_SERIAL ) SET(EXECSPACE_CUDA_CPP_TYPE Kokkos::Cuda) @@ -16,7 +16,7 @@ SET(EXECSPACE_HIP_CPP_TYPE Kokkos::Experimental::HIP) SET(EXECSPACE_SYCL_CPP_TYPE Kokkos::Experimental::SYCL) SET(EXECSPACE_OPENMPTARGET_CPP_TYPE Kokkos::Experimental::OpenMPTarget) SET(EXECSPACE_OPENMP_CPP_TYPE Kokkos::OpenMP) -SET(EXECSPACE_PTHREAD_CPP_TYPE Kokkos::Threads) +SET(EXECSPACE_THREADS_CPP_TYPE Kokkos::Threads) SET(EXECSPACE_SERIAL_CPP_TYPE Kokkos::Serial) SET(MEM_SPACES @@ -174,17 +174,17 @@ ENDIF() KOKKOSKERNELS_ADD_OPTION( INST_EXECSPACE_THREADS - ${KOKKOSKERNELS_INST_EXECSPACE_PTHREAD_DEFAULT} + ${KOKKOSKERNELS_INST_EXECSPACE_THREADS_DEFAULT} BOOL - "Whether to build kernels for the execution space Kokkos::Threads. If explicit template instantiation (ETI) is enabled in Trilinos, disabling this when Kokkos_ENABLE_PTHREAD is enabled may increase build times. Default: ON if Kokkos is Threads-enabled, OFF otherwise." + "Whether to build kernels for the execution space Kokkos::Threads. If explicit template instantiation (ETI) is enabled in Trilinos, disabling this when Kokkos_ENABLE_THREADS is enabled may increase build times. Default: ON if Kokkos is Threads-enabled, OFF otherwise." 
) #There continues to be name ambiguity with threads vs pthreads -SET(KOKKOSKERNELS_INST_EXECSPACE_PTHREAD ${KOKKOSKERNELS_INST_EXECSPACE_THREADS}) +SET(KOKKOSKERNELS_INST_EXECSPACE_THREADS ${KOKKOSKERNELS_INST_EXECSPACE_THREADS}) -IF(KOKKOSKERNELS_INST_EXECSPACE_PTHREAD AND KOKKOSKERNELS_INST_MEMSPACE_HOSTSPACE) +IF(KOKKOSKERNELS_INST_EXECSPACE_THREADS AND KOKKOSKERNELS_INST_MEMSPACE_HOSTSPACE) LIST(APPEND DEVICE_LIST "") - IF(NOT KOKKOS_ENABLE_PTHREAD) - MESSAGE(FATAL_ERROR "Set ETI on for PTHREAD, but Kokkos was not configured with the PTHREAD backend") + IF(NOT KOKKOS_ENABLE_THREADS) + MESSAGE(FATAL_ERROR "Set ETI on for THREADS, but Kokkos was not configured with the THREADS backend") ENDIF() ENDIF() @@ -201,7 +201,7 @@ SET(EXECSPACE_SYCL_VALID_MEM_SPACES SYCLSPACE SYCLSHAREDSPACE) SET(EXECSPACE_OPENMPTARGET_VALID_MEM_SPACES OPENMPTARGETSPACE) SET(EXECSPACE_SERIAL_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(EXECSPACE_OPENMP_VALID_MEM_SPACES HBWSPACE HOSTSPACE) -SET(EXECSPACE_PTHREAD_VALID_MEM_SPACES HBWSPACE HOSTSPACE) +SET(EXECSPACE_THREADS_VALID_MEM_SPACES HBWSPACE HOSTSPACE) SET(DEVICES) FOREACH(EXEC ${EXEC_SPACES}) IF (KOKKOSKERNELS_INST_${EXEC}) diff --git a/cmake/kokkoskernels_eti_floats.cmake b/cmake/kokkoskernels_eti_floats.cmake index 69e50af3cd..debf99bb0e 100644 --- a/cmake/kokkoskernels_eti_floats.cmake +++ b/cmake/kokkoskernels_eti_floats.cmake @@ -18,6 +18,13 @@ KOKKOSKERNELS_ADD_OPTION( "Whether to pre instantiate kernels for the scalar type float. Disabling this may increase build times. Default: OFF or unless enabled during a Trilinos build with Trilinos_ENABLE_FLOAT." ) +KOKKOSKERNELS_ADD_OPTION( + INST_HALF + OFF + BOOL + "Whether to pre instantiate kernels for the scalar type Kokkos::Experimental::half_t. Disabling this may increase build times. 
Default: OFF" +) + SET(FLOATS FLOAT DOUBLE @@ -25,6 +32,7 @@ SET(FLOATS COMPLEX_DOUBLE) SET(DOUBLE_CPP_TYPE "double") SET(FLOAT_CPP_TYPE "float") +SET(HALF_CPP_TYPE "Kokkos::Experimental::half_t") SET(COMPLEX_FLOAT_CPP_TYPE "Kokkos::complex") SET(COMPLEX_DOUBLE_CPP_TYPE "Kokkos::complex") @@ -63,6 +71,11 @@ IF (KOKKOSKERNELS_INST_FLOAT) LIST(APPEND SCALAR_LIST "float") ENDIF() +# TODO: Fix build errors in kokkos when half_t is used in ETI +#IF (KOKKOSKERNELS_INST_HALF) +# LIST(APPEND SCALAR_LIST "Kokkos::Experimental::half_t") +#ENDIF() + IF (KOKKOSKERNELS_INST_COMPLEX_DOUBLE) LIST(APPEND SCALAR_LIST "complex") ENDIF() diff --git a/cmake/kokkoskernels_tpls.cmake b/cmake/kokkoskernels_tpls.cmake index 15ff4e8bd6..f650168757 100644 --- a/cmake/kokkoskernels_tpls.cmake +++ b/cmake/kokkoskernels_tpls.cmake @@ -339,9 +339,7 @@ MACRO(kokkoskernels_export_imported_tpl NAME) ENDIF() SET(TPL_LINK_OPTIONS) - IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.13.0") - GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${TPL_IMPORTED_NAME} INTERFACE_LINK_OPTIONS) - ENDIF() + GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${TPL_IMPORTED_NAME} INTERFACE_LINK_OPTIONS) IF(TPL_LINK_OPTIONS) KOKKOSKERNELS_APPEND_CONFIG_LINE("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") ENDIF() @@ -371,9 +369,7 @@ MACRO(kokkoskernels_import_tpl NAME) # I have still been getting errors about ROOT variables being ignored # I'm not sure if this is a scope issue - but make sure # the policy is set before we do any find_package calls - IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") - CMAKE_POLICY(SET CMP0074 NEW) - ENDIF() + CMAKE_POLICY(SET CMP0074 NEW) IF (KOKKOSKERNELS_ENABLE_TPL_${NAME}) #Tack on a TPL here to make sure we avoid using anyone else's find @@ -391,16 +387,6 @@ MACRO(kokkoskernels_import_tpl NAME) ENDIF() ENDMACRO(kokkoskernels_import_tpl) -FUNCTION(TARGET_LINK_FLAGS_PORTABLE LIBRARY) - IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.13") - #great, this works the "right" way - TARGET_LINK_OPTIONS(${LIBRARY} ${ARGN}) - ELSE() - #bummer, this works the "hacky" way - TARGET_LINK_LIBRARIES(${LIBRARY} ${ARGN}) - ENDIF() -ENDFUNCTION(TARGET_LINK_FLAGS_PORTABLE) - FUNCTION(kokkoskernels_link_tpl TARGET) CMAKE_PARSE_ARGUMENTS(TPL "PUBLIC;PRIVATE;INTERFACE" @@ -466,6 +452,20 @@ KOKKOSKERNELS_ADD_TPL_OPTION(CUBLAS ${CUBLAS_DEFAULT} "Whether to enable C KOKKOSKERNELS_ADD_TPL_OPTION(CUSPARSE ${CUSPARSE_DEFAULT} "Whether to enable CUSPARSE" DEFAULT_DOCSTRING "ON if CUDA-enabled Kokkos, otherwise OFF") +KOKKOSKERNELS_ADD_OPTION(NO_DEFAULT_ROCM_TPLS OFF BOOL "Whether ROCM TPLs should be enabled by default. Default: OFF") +# Unlike CUDA, ROCm does not automatically install these TPLs +SET(ROCBLAS_DEFAULT OFF) +SET(ROCSPARSE_DEFAULT OFF) +# Since the default is OFF we do not really need this piece of logic here. 
+# IF(KOKKOSKERNELS_NO_DEFAULT_ROCM_TPLS) +# SET(ROCBLAS_DEFAULT OFF) +# SET(ROCSPARSE_DEFAULT OFF) +# ENDIF() +KOKKOSKERNELS_ADD_TPL_OPTION(ROCBLAS ${ROCBLAS_DEFAULT} "Whether to enable ROCBLAS" + DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") +KOKKOSKERNELS_ADD_TPL_OPTION(ROCSPARSE ${ROCSPARSE_DEFAULT} "Whether to enable ROCSPARSE" + DEFAULT_DOCSTRING "ON if HIP-enabled Kokkos, otherwise OFF") + IF (KOKKOSKERNELS_ENABLE_TPL_MAGMA) IF (F77_BLAS_MANGLE STREQUAL "(name,NAME) name ## _") SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DADD_ -fopenmp -lgfortran") @@ -501,6 +501,8 @@ IF (NOT KOKKOSKERNELS_HAS_TRILINOS) KOKKOSKERNELS_IMPORT_TPL(METIS) KOKKOSKERNELS_IMPORT_TPL(ARMPL) KOKKOSKERNELS_IMPORT_TPL(MAGMA) + KOKKOSKERNELS_IMPORT_TPL(ROCBLAS) + KOKKOSKERNELS_IMPORT_TPL(ROCSPARSE) ELSE () IF (Trilinos_ENABLE_SuperLU5_API) SET(HAVE_KOKKOSKERNELS_SUPERLU5_API TRUE) diff --git a/docs/Doxyfile.in b/docs/Doxyfile.in index 8e208bb937..8ca2c879ea 100644 --- a/docs/Doxyfile.in +++ b/docs/Doxyfile.in @@ -477,13 +477,13 @@ NUM_PROC_THREADS = 1 # normally produced when WARNINGS is set to YES. # The default value is: NO. -EXTRACT_ALL = NO +EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. -EXTRACT_PRIVATE = NO +EXTRACT_PRIVATE = YES # If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual # methods of a class will be included in the documentation. @@ -495,13 +495,13 @@ EXTRACT_PRIV_VIRTUAL = NO # scope will be included in the documentation. # The default value is: NO. -EXTRACT_PACKAGE = NO +EXTRACT_PACKAGE = YES # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. -EXTRACT_STATIC = NO +EXTRACT_STATIC = YES # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, @@ -2236,7 +2236,7 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = DOXY # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/docs/index.rst b/docs/index.rst index a728877de3..06240595bf 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -22,11 +22,12 @@ Indices and tables Docs ==== -.. doxygennamespace:: Kokkos - :members: .. doxygennamespace:: KokkosBlas + :project: KokkosKernels :members: .. doxygennamespace:: KokkosSparse + :project: KokkosKernels :members: .. 
doxygennamespace:: KokkosBatched + :project: KokkosKernels :members: \ No newline at end of file diff --git a/example/cmake/in-tree/CMakeLists.txt b/example/cmake/in-tree/CMakeLists.txt index 79dc09b06c..2192d78e29 100644 --- a/example/cmake/in-tree/CMakeLists.txt +++ b/example/cmake/in-tree/CMakeLists.txt @@ -1,6 +1,5 @@ -#Kokkos requires at least 3.10 -#but really you should use 3.12 -cmake_minimum_required (VERSION 3.10) +#Kokkos requires at least 3.16 +cmake_minimum_required (VERSION 3.16) project (MyProgram) diff --git a/example/cmake/install/CMakeLists.txt b/example/cmake/install/CMakeLists.txt index 44a1777e3c..51233783df 100644 --- a/example/cmake/install/CMakeLists.txt +++ b/example/cmake/install/CMakeLists.txt @@ -1,11 +1,8 @@ -#Kokkos requires at least 3.10 -#but really you should use 3.12 -cmake_minimum_required (VERSION 3.10) +#Kokkos requires at least 3.16 +cmake_minimum_required (VERSION 3.16) -IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") - MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") - CMAKE_POLICY(SET CMP0074 NEW) -ENDIF() +MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") +CMAKE_POLICY(SET CMP0074 NEW) project (MyProgram) diff --git a/example/fenl/CGSolve.hpp b/example/fenl/CGSolve.hpp index 3a3d2a3301..79c8badfeb 100644 --- a/example/fenl/CGSolve.hpp +++ b/example/fenl/CGSolve.hpp @@ -60,86 +60,80 @@ namespace Kokkos { namespace Example { -template< class ImportType , class SparseMatrixType , class VectorType , class TagType = void > -struct CGSolve ; - - -template< class ImportType , class SparseMatrixType , class VectorType > -struct CGSolve< ImportType , SparseMatrixType , VectorType , - typename std::enable_if<( - Kokkos::Impl::is_view< VectorType >::value && - VectorType::rank == 1 - )>::type > -{ - typedef typename VectorType::value_type scalar_type ; +template +struct CGSolve; + +template +struct CGSolve::value && + VectorType::rank == 1)>::type> { + typedef typename VectorType::value_type scalar_type; typedef typename VectorType::execution_space execution_space; - size_t iteration ; - double iter_time ; - double matvec_time ; - double norm_res ; - - CGSolve( const ImportType & import , - const SparseMatrixType & A , - const VectorType & b , - const VectorType & x , - const size_t maximum_iteration = 200 , - const double tolerance = std::numeric_limits::epsilon() ) - : iteration(0) - , iter_time(0) - , matvec_time(0) - , norm_res(0) - { - const size_t count_owned = import.count_owned ; + size_t iteration; + double iter_time; + double matvec_time; + double norm_res; + + CGSolve(const ImportType& import, const SparseMatrixType& A, + const VectorType& b, const VectorType& x, + const size_t maximum_iteration = 200, + const double tolerance = std::numeric_limits::epsilon()) + : iteration(0), iter_time(0), matvec_time(0), norm_res(0) { + const size_t count_owned = import.count_owned; const size_t count_total = import.count_owned + import.count_receive; // Need input vector to matvec to be owned + received - VectorType pAll ( "cg::p" , count_total ); + VectorType pAll("cg::p", count_total); - VectorType p = Kokkos::subview( pAll , std::pair(0,count_owned) ); - VectorType r ( "cg::r" , count_owned ); - VectorType Ap( "cg::Ap", count_owned ); + VectorType p = + Kokkos::subview(pAll, std::pair(0, count_owned)); + VectorType r("cg::r", count_owned); + VectorType Ap("cg::Ap", count_owned); /* r = b - A * x ; */ - /* p = x */ Kokkos::deep_copy( p , x ); - /* import p */ import( pAll ); - /* Ap = A * p */ KokkosSparse::spmv( "N" , 1.0 , A 
, pAll , 0.0 , Ap); - /* b - Ap => r */ KokkosBlas::update( 1.0 , b , -1.0 , Ap , 0.0 , r); - /* p = r */ Kokkos::deep_copy( p , r ); + /* p = x */ Kokkos::deep_copy(p, x); + /* import p */ import(pAll); + /* Ap = A * p */ KokkosSparse::spmv("N", 1.0, A, pAll, 0.0, Ap); + /* b - Ap => r */ KokkosBlas::update(1.0, b, -1.0, Ap, 0.0, r); + /* p = r */ Kokkos::deep_copy(p, r); - double old_rdot = Kokkos::Example::all_reduce( KokkosBlas::dot( r , r ) , import.comm ); + double old_rdot = + Kokkos::Example::all_reduce(KokkosBlas::dot(r, r), import.comm); - norm_res = sqrt( old_rdot ); - iteration = 0 ; + norm_res = sqrt(old_rdot); + iteration = 0; - Kokkos::Timer wall_clock ; + Kokkos::Timer wall_clock; Kokkos::Timer timer; - while ( tolerance < norm_res && iteration < maximum_iteration ) { - + while (tolerance < norm_res && iteration < maximum_iteration) { /* pAp_dot = dot( p , Ap = A * p ) */ timer.reset(); - /* import p */ import( pAll ); - /* Ap = A * p */ KokkosSparse::spmv( "N", 1.0, A , pAll, 0.0, Ap); + /* import p */ import(pAll); + /* Ap = A * p */ KokkosSparse::spmv("N", 1.0, A, pAll, 0.0, Ap); execution_space().fence(); matvec_time += timer.seconds(); - const double pAp_dot = Kokkos::Example::all_reduce( KokkosBlas::dot( p , Ap ) , import.comm ); - const double alpha = old_rdot / pAp_dot ; + const double pAp_dot = + Kokkos::Example::all_reduce(KokkosBlas::dot(p, Ap), import.comm); + const double alpha = old_rdot / pAp_dot; - /* x += alpha * p ; */ KokkosBlas::axpby( alpha, p , 1.0 , x ); - /* r += -alpha * Ap ; */ KokkosBlas::axpby(-alpha, Ap , 1.0 , r ); + /* x += alpha * p ; */ KokkosBlas::axpby(alpha, p, 1.0, x); + /* r += -alpha * Ap ; */ KokkosBlas::axpby(-alpha, Ap, 1.0, r); - const double r_dot = Kokkos::Example::all_reduce( KokkosBlas::dot( r , r ) , import.comm ); - const double beta = r_dot / old_rdot ; + const double r_dot = + Kokkos::Example::all_reduce(KokkosBlas::dot(r, r), import.comm); + const double beta = r_dot / old_rdot; - /* p = r + beta * p ; */ KokkosBlas::axpby( 1.0 , r , beta , p ); + /* p = r + beta * p ; */ KokkosBlas::axpby(1.0, r, beta, p); - norm_res = std::sqrt( old_rdot = r_dot ); + norm_res = std::sqrt(old_rdot = r_dot); - ++iteration ; + ++iteration; } execution_space().fence(); @@ -147,12 +141,10 @@ struct CGSolve< ImportType , SparseMatrixType , VectorType , } }; -} // namespace Example -} // namespace Kokkos +} // namespace Example +} // namespace Kokkos //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- #endif /* #ifndef KOKKOS_EXAMPLE_CG_SOLVE */ - - diff --git a/example/fenl/main.cpp b/example/fenl/main.cpp index 67bf07d98e..ba99f0341e 100644 --- a/example/fenl/main.cpp +++ b/example/fenl/main.cpp @@ -65,153 +65,166 @@ //---------------------------------------------------------------------------- -enum { CMD_USE_THREADS = 0 - , CMD_USE_NUMA - , CMD_USE_CORE_PER_NUMA - , CMD_USE_CUDA - , CMD_USE_OPENMP - , CMD_USE_CUDA_DEV - , CMD_USE_FIXTURE_X - , CMD_USE_FIXTURE_Y - , CMD_USE_FIXTURE_Z - , CMD_USE_FIXTURE_BEGIN - , CMD_USE_FIXTURE_END - , CMD_USE_FIXTURE_QUADRATIC - , CMD_USE_ATOMIC - , CMD_USE_TRIALS - , CMD_VTUNE - , CMD_PRINT - , CMD_ECHO - , CMD_ERROR - , CMD_COUNT }; - -void print_cmdline( std::ostream & s , const int cmd[] ) -{ - if ( cmd[ CMD_USE_THREADS ] ) { - s << " Threads(" << cmd[ CMD_USE_THREADS ] - << ") NUMA(" << cmd[ CMD_USE_NUMA ] - << ") CORE_PER_NUMA(" << cmd[ CMD_USE_CORE_PER_NUMA ] - << ")" ; +enum { + CMD_USE_THREADS 
= 0, + CMD_USE_NUMA, + CMD_USE_CORE_PER_NUMA, + CMD_USE_CUDA, + CMD_USE_OPENMP, + CMD_USE_CUDA_DEV, + CMD_USE_FIXTURE_X, + CMD_USE_FIXTURE_Y, + CMD_USE_FIXTURE_Z, + CMD_USE_FIXTURE_BEGIN, + CMD_USE_FIXTURE_END, + CMD_USE_FIXTURE_QUADRATIC, + CMD_USE_ATOMIC, + CMD_USE_TRIALS, + CMD_VTUNE, + CMD_PRINT, + CMD_ECHO, + CMD_ERROR, + CMD_COUNT +}; + +void print_cmdline(std::ostream& s, const int cmd[]) { + if (cmd[CMD_USE_THREADS]) { + s << " Threads(" << cmd[CMD_USE_THREADS] << ") NUMA(" << cmd[CMD_USE_NUMA] + << ") CORE_PER_NUMA(" << cmd[CMD_USE_CORE_PER_NUMA] << ")"; } - if ( cmd[ CMD_USE_OPENMP ] ) { - s << " OpenMP(" << cmd[ CMD_USE_OPENMP ] - << ") NUMA(" << cmd[ CMD_USE_NUMA ] - << ") CORE_PER_NUMA(" << cmd[ CMD_USE_CORE_PER_NUMA ] - << ")" ; + if (cmd[CMD_USE_OPENMP]) { + s << " OpenMP(" << cmd[CMD_USE_OPENMP] << ") NUMA(" << cmd[CMD_USE_NUMA] + << ") CORE_PER_NUMA(" << cmd[CMD_USE_CORE_PER_NUMA] << ")"; } - if ( cmd[ CMD_USE_FIXTURE_X ] ) { - s << " Fixture(" << cmd[ CMD_USE_FIXTURE_X ] - << "x" << cmd[ CMD_USE_FIXTURE_Y ] - << "x" << cmd[ CMD_USE_FIXTURE_Z ] - << ")" ; + if (cmd[CMD_USE_FIXTURE_X]) { + s << " Fixture(" << cmd[CMD_USE_FIXTURE_X] << "x" << cmd[CMD_USE_FIXTURE_Y] + << "x" << cmd[CMD_USE_FIXTURE_Z] << ")"; } - if ( cmd[ CMD_USE_FIXTURE_BEGIN ] ) { - s << " Fixture( " << cmd[ CMD_USE_FIXTURE_BEGIN ] - << " .. " << cmd[ CMD_USE_FIXTURE_END ] - << " )" ; + if (cmd[CMD_USE_FIXTURE_BEGIN]) { + s << " Fixture( " << cmd[CMD_USE_FIXTURE_BEGIN] << " .. " + << cmd[CMD_USE_FIXTURE_END] << " )"; } - if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { - s << " Quadratic-Element" ; + if (cmd[CMD_USE_FIXTURE_QUADRATIC]) { + s << " Quadratic-Element"; } - if ( cmd[ CMD_USE_CUDA ] ) { - s << " CUDA(" << cmd[ CMD_USE_CUDA_DEV ] << ")" ; + if (cmd[CMD_USE_CUDA]) { + s << " CUDA(" << cmd[CMD_USE_CUDA_DEV] << ")"; } - if ( cmd[ CMD_USE_ATOMIC ] ) { - s << " ATOMIC" ; + if (cmd[CMD_USE_ATOMIC]) { + s << " ATOMIC"; } - if ( cmd[ CMD_USE_TRIALS ] ) { - s << " TRIALS(" << cmd[ CMD_USE_TRIALS ] << ")" ; + if (cmd[CMD_USE_TRIALS]) { + s << " TRIALS(" << cmd[CMD_USE_TRIALS] << ")"; } - if ( cmd[ CMD_VTUNE ] ) { - s << " VTUNE" ; + if (cmd[CMD_VTUNE]) { + s << " VTUNE"; } - if ( cmd[ CMD_PRINT ] ) { - s << " PRINT" ; + if (cmd[CMD_PRINT]) { + s << " PRINT"; } - s << std::endl ; + s << std::endl; } -void print_perf_value( std::ostream & s , const std::vector & widths, const Kokkos::Example::FENL::Perf & perf ) -{ - int i=0; +void print_perf_value(std::ostream& s, const std::vector& widths, + const Kokkos::Example::FENL::Perf& perf) { + int i = 0; s << std::setw(widths[i++]) << perf.global_elem_count << " ,"; s << std::setw(widths[i++]) << perf.global_node_count << " ,"; s << std::setw(widths[i++]) << perf.newton_iter_count << " ,"; s << std::setw(widths[i++]) << perf.cg_iter_count << " ,"; s << std::setw(widths[i++]) << perf.map_ratio << " ,"; - s << std::setw(widths[i++]) << ( perf.fill_node_set * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.scan_node_count * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.fill_graph_entries * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.sort_graph_entries * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.fill_element_graph * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.create_sparse_matrix * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.fill_time * 1000.0 ) / 
perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( perf.bc_time * 1000.0 ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( ( perf.matvec_time * 1000.0 ) / perf.cg_iter_count ) / perf.global_node_count << " ,"; - s << std::setw(widths[i++]) << ( ( perf.cg_time * 1000.0 ) / perf.cg_iter_count ) / perf.global_node_count << " ,"; - s << std::setw(widths[i]) << perf.error_max; - s << std::endl ; + s << std::setw(widths[i++]) + << (perf.fill_node_set * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.scan_node_count * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.fill_graph_entries * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.sort_graph_entries * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.fill_element_graph * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.create_sparse_matrix * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.fill_time * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << (perf.bc_time * 1000.0) / perf.global_node_count << " ,"; + s << std::setw(widths[i++]) + << ((perf.matvec_time * 1000.0) / perf.cg_iter_count) / + perf.global_node_count + << " ,"; + s << std::setw(widths[i++]) + << ((perf.cg_time * 1000.0) / perf.cg_iter_count) / perf.global_node_count + << " ,"; + s << std::setw(widths[i]) << perf.error_max; + s << std::endl; } -template< class Device , Kokkos::Example::BoxElemPart::ElemOrder ElemOrder > -void run( MPI_Comm comm , const int cmd[] ) -{ - int comm_rank = 0 ; - int comm_size = 1 ; +template +void run(MPI_Comm comm, const int cmd[]) { + int comm_rank = 0; + int comm_size = 1; -#if defined( KOKKOS_ENABLE_MPI ) - MPI_Comm_rank( comm , & comm_rank ); - MPI_Comm_size( comm , & comm_size ); +#if defined(KOKKOS_ENABLE_MPI) + MPI_Comm_rank(comm, &comm_rank); + MPI_Comm_size(comm, &comm_size); #else - comm = 0 ; + comm = 0; (void)comm_size; #endif + if (0 == comm_rank) { + if (cmd[CMD_USE_THREADS]) { + std::cout << "THREADS , " << cmd[CMD_USE_THREADS]; + } else if (cmd[CMD_USE_OPENMP]) { + std::cout << "OPENMP , " << cmd[CMD_USE_OPENMP]; + } else if (cmd[CMD_USE_CUDA]) { + std::cout << "CUDA"; + } - if ( 0 == comm_rank ) { - if ( cmd[ CMD_USE_THREADS ] ) { std::cout << "THREADS , " << cmd[ CMD_USE_THREADS ] ; } - else if ( cmd[ CMD_USE_OPENMP ] ) { std::cout << "OPENMP , " << cmd[ CMD_USE_OPENMP ] ; } - else if ( cmd[ CMD_USE_CUDA ] ) { std::cout << "CUDA" ; } - - if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { std::cout << " , QUADRATIC-ELEMENT" ; } - else { std::cout << " , LINEAR-ELEMENT" ; } + if (cmd[CMD_USE_FIXTURE_QUADRATIC]) { + std::cout << " , QUADRATIC-ELEMENT"; + } else { + std::cout << " , LINEAR-ELEMENT"; + } - if ( cmd[ CMD_USE_ATOMIC ] ) { std::cout << " , USING ATOMICS" ; } + if (cmd[CMD_USE_ATOMIC]) { + std::cout << " , USING ATOMICS"; + } } - std::vector< std::pair > headers; - - - headers.push_back(std::make_pair("ELEMS","count")); - headers.push_back(std::make_pair("NODES","count")); - headers.push_back(std::make_pair("NEWTON","iter")); - headers.push_back(std::make_pair("CG","iter")); - headers.push_back(std::make_pair("MAP_RATIO","ratio")); - headers.push_back(std::make_pair("SET_FILL/NODE","millisec")); - headers.push_back(std::make_pair("SCAN/NODE","millisec")); - headers.push_back(std::make_pair("GRAPH_FILL/NODE","millisec")); - 
headers.push_back(std::make_pair("SORT/NODE","millisec")); - headers.push_back(std::make_pair("ELEM_GRAPH_FILL/NODE","millisec")); - headers.push_back(std::make_pair("MATRIX_CREATE/NODE","millisec")); - headers.push_back(std::make_pair("MATRIX_FILL/NODE","millisec")); - headers.push_back(std::make_pair("BOUNDARY/NODE","millisec")); - headers.push_back(std::make_pair("MAT_VEC/ITER/ROW","millisec")); - headers.push_back(std::make_pair("CG/ITER/ROW","millisec")); - headers.push_back(std::make_pair("ERROR","ratio")); + std::vector > headers; + + headers.push_back(std::make_pair("ELEMS", "count")); + headers.push_back(std::make_pair("NODES", "count")); + headers.push_back(std::make_pair("NEWTON", "iter")); + headers.push_back(std::make_pair("CG", "iter")); + headers.push_back(std::make_pair("MAP_RATIO", "ratio")); + headers.push_back(std::make_pair("SET_FILL/NODE", "millisec")); + headers.push_back(std::make_pair("SCAN/NODE", "millisec")); + headers.push_back(std::make_pair("GRAPH_FILL/NODE", "millisec")); + headers.push_back(std::make_pair("SORT/NODE", "millisec")); + headers.push_back(std::make_pair("ELEM_GRAPH_FILL/NODE", "millisec")); + headers.push_back(std::make_pair("MATRIX_CREATE/NODE", "millisec")); + headers.push_back(std::make_pair("MATRIX_FILL/NODE", "millisec")); + headers.push_back(std::make_pair("BOUNDARY/NODE", "millisec")); + headers.push_back(std::make_pair("MAT_VEC/ITER/ROW", "millisec")); + headers.push_back(std::make_pair("CG/ITER/ROW", "millisec")); + headers.push_back(std::make_pair("ERROR", "ratio")); // find print widths size_t min_width = 10; - std::vector< size_t > widths(headers.size()); - for (size_t i=0, ie=headers.size(); i widths(headers.size()); + for (size_t i = 0, ie = headers.size(); i < ie; ++i) + widths[i] = std::max(min_width, headers[i].first.size() + 1); // print column headers - if ( 0 == comm_rank ) { - std::cout << std::endl ; - for (size_t i=0; i - ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem ) - : Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear > - ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem ) - ; - - if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf ); + cmd[CMD_USE_FIXTURE_QUADRATIC] + ? Kokkos::Example::FENL::fenl< + Device, Kokkos::Example::BoxElemPart::ElemQuadratic>( + comm, cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], + cmd[CMD_USE_ATOMIC], nelem) + : Kokkos::Example::FENL::fenl< + Device, Kokkos::Example::BoxElemPart::ElemLinear>( + comm, cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], + cmd[CMD_USE_ATOMIC], nelem); + + if (0 == comm_rank) print_perf_value(std::cout, widths, perf); } - } - else { - int nelem[3] = { cmd[ CMD_USE_FIXTURE_X ] , - cmd[ CMD_USE_FIXTURE_Y ] , - cmd[ CMD_USE_FIXTURE_Z ] }; + } else { + int nelem[3] = {cmd[CMD_USE_FIXTURE_X], cmd[CMD_USE_FIXTURE_Y], + cmd[CMD_USE_FIXTURE_Z]}; const Kokkos::Example::FENL::Perf perf = - cmd[ CMD_USE_FIXTURE_QUADRATIC ] - ? Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemQuadratic > - ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem ) - : Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear > - ( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem ) - ; - - if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf ); + cmd[CMD_USE_FIXTURE_QUADRATIC] + ? 
Kokkos::Example::FENL::fenl< + Device, Kokkos::Example::BoxElemPart::ElemQuadratic>( + comm, cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], + cmd[CMD_USE_ATOMIC], nelem) + : Kokkos::Example::FENL::fenl< + Device, Kokkos::Example::BoxElemPart::ElemLinear>( + comm, cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], + cmd[CMD_USE_ATOMIC], nelem); + + if (0 == comm_rank) print_perf_value(std::cout, widths, perf); } } //---------------------------------------------------------------------------- -int main( int argc , char ** argv ) -{ - int comm_rank = 0 ; - int comm_size = 1 ; +int main(int argc, char** argv) { + int comm_rank = 0; + int comm_size = 1; -#if defined( KOKKOS_ENABLE_MPI ) - MPI_Init( & argc , & argv ); - MPI_Comm comm = MPI_COMM_WORLD ; - MPI_Comm_rank( comm , & comm_rank ); - MPI_Comm_size( comm , & comm_size ); +#if defined(KOKKOS_ENABLE_MPI) + MPI_Init(&argc, &argv); + MPI_Comm comm = MPI_COMM_WORLD; + MPI_Comm_rank(comm, &comm_rank); + MPI_Comm_size(comm, &comm_size); #else - MPI_Comm comm = 0 ; - (void) comm; - (void) comm_size; + MPI_Comm comm = 0; + (void)comm; + (void)comm_size; #endif - int cmdline[ CMD_COUNT ] ; - - for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ; - - if ( 0 == comm_rank ) { - for ( int i = 1 ; i < argc ; ++i ) { - if ( 0 == strcasecmp( argv[i] , "threads" ) ) { - cmdline[ CMD_USE_THREADS ] = atoi( argv[++i] ); - } - else if ( 0 == strcasecmp( argv[i] , "openmp" ) ) { - cmdline[ CMD_USE_OPENMP ] = atoi( argv[++i] ); - } - else if ( 0 == strcasecmp( argv[i] , "cores" ) ) { - sscanf( argv[++i] , "%dx%d" , - cmdline + CMD_USE_NUMA , - cmdline + CMD_USE_CORE_PER_NUMA ); - } - else if ( 0 == strcasecmp( argv[i] , "cuda" ) ) { - cmdline[ CMD_USE_CUDA ] = 1 ; - } - else if ( 0 == strcasecmp( argv[i] , "cuda-dev" ) ) { - cmdline[ CMD_USE_CUDA ] = 1 ; - cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ; - } - else if ( 0 == strcasecmp( argv[i] , "fixture" ) ) { - sscanf( argv[++i] , "%dx%dx%d" , - cmdline + CMD_USE_FIXTURE_X , - cmdline + CMD_USE_FIXTURE_Y , - cmdline + CMD_USE_FIXTURE_Z ); - } - else if ( 0 == strcasecmp( argv[i] , "fixture-range" ) ) { - sscanf( argv[++i] , "%d..%d" , - cmdline + CMD_USE_FIXTURE_BEGIN , - cmdline + CMD_USE_FIXTURE_END ); - } - else if ( 0 == strcasecmp( argv[i] , "fixture-quadratic" ) ) { - cmdline[ CMD_USE_FIXTURE_QUADRATIC ] = 1 ; - } - else if ( 0 == strcasecmp( argv[i] , "atomic" ) ) { - cmdline[ CMD_USE_ATOMIC ] = 1 ; - } - else if ( 0 == strcasecmp( argv[i] , "trials" ) ) { - cmdline[ CMD_USE_TRIALS ] = atoi( argv[++i] ) ; - } - else if ( 0 == strcasecmp( argv[i] , "vtune" ) ) { - cmdline[ CMD_VTUNE ] = 1 ; - } - else if ( 0 == strcasecmp( argv[i] , "print" ) ) { - cmdline[ CMD_PRINT ] = 1 ; - } - else if ( 0 == strcasecmp( argv[i] , "echo" ) ) { - cmdline[ CMD_ECHO ] = 1 ; - } - else { - cmdline[ CMD_ERROR ] = 1 ; - - std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ; + int cmdline[CMD_COUNT]; + + for (int i = 0; i < CMD_COUNT; ++i) cmdline[i] = 0; + + if (0 == comm_rank) { + for (int i = 1; i < argc; ++i) { + if (0 == Test::string_compare_no_case(argv[i], "threads")) { + cmdline[CMD_USE_THREADS] = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "openmp")) { + cmdline[CMD_USE_OPENMP] = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "cores")) { + sscanf(argv[++i], "%dx%d", cmdline + CMD_USE_NUMA, + cmdline + CMD_USE_CORE_PER_NUMA); + } else if (0 == Test::string_compare_no_case(argv[i], "cuda")) { + cmdline[CMD_USE_CUDA] = 1; + } else if (0 == 
Test::string_compare_no_case(argv[i], "cuda-dev")) { + cmdline[CMD_USE_CUDA] = 1; + cmdline[CMD_USE_CUDA_DEV] = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "fixture")) { + sscanf(argv[++i], "%dx%dx%d", cmdline + CMD_USE_FIXTURE_X, + cmdline + CMD_USE_FIXTURE_Y, cmdline + CMD_USE_FIXTURE_Z); + } else if (0 == Test::string_compare_no_case(argv[i], "fixture-range")) { + sscanf(argv[++i], "%d..%d", cmdline + CMD_USE_FIXTURE_BEGIN, + cmdline + CMD_USE_FIXTURE_END); + } else if (0 == + Test::string_compare_no_case(argv[i], "fixture-quadratic")) { + cmdline[CMD_USE_FIXTURE_QUADRATIC] = 1; + } else if (0 == Test::string_compare_no_case(argv[i], "atomic")) { + cmdline[CMD_USE_ATOMIC] = 1; + } else if (0 == Test::string_compare_no_case(argv[i], "trials")) { + cmdline[CMD_USE_TRIALS] = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "vtune")) { + cmdline[CMD_VTUNE] = 1; + } else if (0 == Test::string_compare_no_case(argv[i], "print")) { + cmdline[CMD_PRINT] = 1; + } else if (0 == Test::string_compare_no_case(argv[i], "echo")) { + cmdline[CMD_ECHO] = 1; + } else { + cmdline[CMD_ERROR] = 1; + + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; } } - if ( cmdline[ CMD_ECHO ] && 0 == comm_rank ) { print_cmdline( std::cout , cmdline ); } + if (cmdline[CMD_ECHO] && 0 == comm_rank) { + print_cmdline(std::cout, cmdline); + } } -#if defined( KOKKOS_ENABLE_MPI ) - MPI_Bcast( cmdline , CMD_COUNT , MPI_INT , 0 , comm ); +#if defined(KOKKOS_ENABLE_MPI) + MPI_Bcast(cmdline, CMD_COUNT, MPI_INT, 0, comm); #endif - if ( cmdline[ CMD_VTUNE ] ) { + if (cmdline[CMD_VTUNE]) { std::stringstream cmd; - pid_t my_os_pid=getpid(); + pid_t my_os_pid = getpid(); const std::string vtune_loc = - "/usr/local/intel/vtune_amplifier_xe_2013/bin64/amplxe-cl"; + "/usr/local/intel/vtune_amplifier_xe_2013/bin64/amplxe-cl"; const std::string output_dir = "./vtune/vtune."; - const int p_rank = comm_rank; - cmd << vtune_loc - << " -collect hotspots -result-dir " << output_dir << p_rank - << " -target-pid " << my_os_pid << " &"; - if (p_rank == 0) - std::cout << cmd.str() << std::endl; + const int p_rank = comm_rank; + cmd << vtune_loc << " -collect hotspots -result-dir " << output_dir + << p_rank << " -target-pid " << my_os_pid << " &"; + if (p_rank == 0) std::cout << cmd.str() << std::endl; system(cmd.str().c_str()); system("sleep 10"); } - if ( ! cmdline[ CMD_ERROR ] && ! cmdline[ CMD_ECHO ] ) { - - if ( ! cmdline[ CMD_USE_TRIALS ] ) { cmdline[ CMD_USE_TRIALS ] = 1 ; } - - if ( ! cmdline[ CMD_USE_FIXTURE_X ] && ! 
cmdline[ CMD_USE_FIXTURE_BEGIN ] ) { - cmdline[ CMD_USE_FIXTURE_X ] = 2 ; - cmdline[ CMD_USE_FIXTURE_Y ] = 2 ; - cmdline[ CMD_USE_FIXTURE_Z ] = 2 ; + if (!cmdline[CMD_ERROR] && !cmdline[CMD_ECHO]) { + if (!cmdline[CMD_USE_TRIALS]) { + cmdline[CMD_USE_TRIALS] = 1; } -#if defined( KOKKOS_ENABLE_THREADS ) + if (!cmdline[CMD_USE_FIXTURE_X] && !cmdline[CMD_USE_FIXTURE_BEGIN]) { + cmdline[CMD_USE_FIXTURE_X] = 2; + cmdline[CMD_USE_FIXTURE_Y] = 2; + cmdline[CMD_USE_FIXTURE_Z] = 2; + } - if ( cmdline[ CMD_USE_THREADS ] ) { +#if defined(KOKKOS_ENABLE_THREADS) - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { - Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] , - cmdline[ CMD_USE_NUMA ] , - cmdline[ CMD_USE_CORE_PER_NUMA ] ); - } - else { - Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] ); + if (cmdline[CMD_USE_THREADS]) { + if (cmdline[CMD_USE_NUMA] && cmdline[CMD_USE_CORE_PER_NUMA]) { + Kokkos::Threads::initialize(cmdline[CMD_USE_THREADS], + cmdline[CMD_USE_NUMA], + cmdline[CMD_USE_CORE_PER_NUMA]); + } else { + Kokkos::Threads::initialize(cmdline[CMD_USE_THREADS]); } - run< Kokkos::Threads , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline ); + run(comm, + cmdline); Kokkos::Threads::finalize(); } #endif -#if defined( KOKKOS_ENABLE_OPENMP ) - - if ( cmdline[ CMD_USE_OPENMP ] ) { +#if defined(KOKKOS_ENABLE_OPENMP) - if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) { - Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] , - cmdline[ CMD_USE_NUMA ] , - cmdline[ CMD_USE_CORE_PER_NUMA ] ); - } - else { - Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] ); + if (cmdline[CMD_USE_OPENMP]) { + if (cmdline[CMD_USE_NUMA] && cmdline[CMD_USE_CORE_PER_NUMA]) { + Kokkos::OpenMP::initialize(cmdline[CMD_USE_OPENMP], + cmdline[CMD_USE_NUMA], + cmdline[CMD_USE_CORE_PER_NUMA]); + } else { + Kokkos::OpenMP::initialize(cmdline[CMD_USE_OPENMP]); } - run< Kokkos::OpenMP , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline ); + run(comm, + cmdline); Kokkos::OpenMP::finalize(); } #endif -#if defined( KOKKOS_ENABLE_CUDA ) - if ( cmdline[ CMD_USE_CUDA ] ) { +#if defined(KOKKOS_ENABLE_CUDA) + if (cmdline[CMD_USE_CUDA]) { // Use the last device: Kokkos::HostSpace::execution_space::initialize(); - Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cmdline[ CMD_USE_CUDA_DEV ] ) ); + Kokkos::Cuda::initialize( + Kokkos::Cuda::SelectDevice(cmdline[CMD_USE_CUDA_DEV])); - run< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline ); + run(comm, + cmdline); Kokkos::Cuda::finalize(); Kokkos::HostSpace::execution_space::finalize(); } #endif - } -#if defined( KOKKOS_ENABLE_MPI ) +#if defined(KOKKOS_ENABLE_MPI) MPI_Finalize(); #endif - return cmdline[ CMD_ERROR ] ? -1 : 0 ; + return cmdline[CMD_ERROR] ? 
-1 : 0; } - diff --git a/example/gmres/CMakeLists.txt b/example/gmres/CMakeLists.txt index 15bfaac95d..4265fc4a5f 100644 --- a/example/gmres/CMakeLists.txt +++ b/example/gmres/CMakeLists.txt @@ -1,29 +1,29 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) -# Workaround https://github.com/kokkos/kokkos/issues/4376 for ibm/xl -IF (NOT ${KOKKOS_COMPILER_IBM}) KOKKOSKERNELS_ADD_EXECUTABLE( gmres_ex_real_A SOURCES ex_real_A.cpp ) +# FIXME_SYCL CUDA_ERROR_INVALID_ADDRESS_SPACE +IF(NOT KOKKOS_ENABLE_SYCL) KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( gmres_test_real_A SOURCES test_real_A.cpp ) +ENDIF() KOKKOSKERNELS_ADD_EXECUTABLE( gmres_test_cmplx_A SOURCES test_cmplx_A.cpp ) +# FIXME_SYCL CUDA_ERROR_INVALID_ADDRESS_SPACE +IF(NOT KOKKOS_ENABLE_SYCL) KOKKOSKERNELS_ADD_EXECUTABLE_AND_TEST( gmres_test_prec SOURCES test_prec.cpp ) - -ELSE () - MESSAGE (STATUS "SKIPPING gmres examples - Kokkos::complex unsupported with ibm/xlC as host compiler") -ENDIF () +ENDIF() diff --git a/example/gmres/gmres.hpp b/example/gmres/gmres.hpp index 1354e4637c..48a6e4ae0d 100644 --- a/example/gmres/gmres.hpp +++ b/example/gmres/gmres.hpp @@ -42,28 +42,57 @@ //@HEADER */ -#include -#include"KokkosKernels_IOUtils.hpp" -#include -#include -#include -#include -#include -#include +#include +#include "Kokkos_ArithTraits.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include +#include +#include +#include +#include +#include +#include "KokkosKernels_Error.hpp" //////////////////////////////////////////////////////////////////////////////// // libstdc++ half_t overloads //////////////////////////////////////////////////////////////////////////////// +namespace Kokkos { +namespace Issue1172WorkAround { #if !KOKKOS_HALF_T_IS_FLOAT -Kokkos::Experimental::half_t abs(Kokkos::Experimental::half_t arg) { - return arg < 0.0 ? -arg : arg; +Kokkos::Experimental::half_t fabs(Kokkos::Experimental::half_t arg) { + using AT = Kokkos::Details::ArithTraits; + return AT::abs(arg); } -Kokkos::complex abs(Kokkos::complex arg) noexcept { - return Kokkos::complex(abs(Kokkos::complex((double) arg.real(), (double) arg.imag()))); +Kokkos::Experimental::half_t fabs( + Kokkos::complex arg) noexcept { + return Kokkos::Experimental::half_t(Kokkos::abs( + Kokkos::complex((double)arg.real(), (double)arg.imag()))); } #endif // KOKKOS_HALF_T_IS_FLOAT +#if !KOKKOS_BHALF_T_IS_FLOAT +Kokkos::Experimental::bhalf_t fabs(Kokkos::Experimental::bhalf_t arg) { + using AT = Kokkos::Details::ArithTraits; + return AT::abs(arg); +} + +Kokkos::Experimental::bhalf_t fabs( + Kokkos::complex arg) noexcept { + return Kokkos::Experimental::bhalf_t(Kokkos::abs( + Kokkos::complex((double)arg.real(), (double)arg.imag()))); +} +#endif // KOKKOS_BHALF_T_IS_FLOAT + +// This fabs wrapper was added to resolve: +// https://github.com/kokkos/kokkos-kernels/issues/1172 +template +KOKKOS_INLINE_FUNCTION T fabs(const Kokkos::complex &x) { + return Kokkos::abs(x); +} +} // namespace Issue1172WorkAround +} // namespace Kokkos + // This struct is returned to the user to give solver // statistics and convergence status. 
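+// NOTE: a minimal, hypothetical usage sketch only (it assumes the gmres() entry
+// point and the stats fields declared further down in this file); callers
+// typically inspect the returned object along these lines:
+//
+//   GmresOpts<double> opts;   // defaults below: tol=1e-8, m=50, maxRestart=50, CGS2
+//   GmresStats stats =
+//       gmres<double, Kokkos::LayoutLeft, Kokkos::DefaultExecutionSpace>(A, B, X, opts);
+//   if (stats.convFlagVal == GmresStats::FLAG::Conv) {
+//     std::cout << "Converged in " << stats.numIters << " iterations, "
+//               << "relative residual " << stats.endRelRes << std::endl;
+//   } else {
+//     std::cout << "GMRES flag: " << stats.convFlag() << std::endl;
+//   }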
struct GmresStats { @@ -72,53 +101,49 @@ struct GmresStats { enum FLAG { Conv, NoConv, LOA }; FLAG convFlagVal; std::string convFlag() { - switch(convFlagVal){ - case Conv: - return "Converged"; - case NoConv: - return "Not Converged"; - case LOA: - return "Solver has had loss of accuracy."; - default: - return "Flag not defined."; + switch (convFlagVal) { + case Conv: return "Converged"; + case NoConv: return "Not Converged"; + case LOA: return "Solver has had loss of accuracy."; + default: return "Flag not defined."; } } }; // This struct allows the user to pass in several -// options to the solver. -template< class ScalarType > -struct GmresOpts -{ - typename Kokkos::Details::ArithTraits::mag_type tol; - int m; - int maxRestart; - std::string ortho; - std::string precSide; - - GmresOpts(): - tol(1e-8), - m(50), - maxRestart(50), - ortho("CGS2") { } +// options to the solver. +template +struct GmresOpts { + typename Kokkos::Details::ArithTraits::mag_type tol; + int m; + int maxRestart; + std::string ortho; + std::string precSide; + + GmresOpts() : tol(1e-8), m(50), maxRestart(50), ortho("CGS2") {} }; -template< class ScalarType, class Layout, class EXSP, class OrdinalType = int > - GmresStats gmres( const KokkosSparse::CrsMatrix &A, - const Kokkos::View &B, - Kokkos::View &X, - const GmresOpts &opts, - const KokkosSparse::Experimental::Preconditioner * const M = NULL){ +template +GmresStats gmres( + const KokkosSparse::CrsMatrix &A, + const Kokkos::View &B, + Kokkos::View &X, + const GmresOpts &opts, + const KokkosSparse::Experimental::Preconditioner< + ScalarType, Layout, EXSP, OrdinalType> *const M = NULL) { + using namespace Kokkos::Issue1172WorkAround; // For 'fabs' wrappers above Kokkos::Profiling::pushRegion("GMRES::TotalTime:"); typedef Kokkos::Details::ArithTraits AT; - typedef typename AT::val_type ST; // So this code will run with ScalarType = std::complex. + typedef typename AT::val_type + ST; // So this code will run with ScalarType = std::complex. typedef typename AT::mag_type MT; - ST one = AT::one(); + ST one = AT::one(); ST zero = AT::zero(); - typedef Kokkos::View ViewVectorType; - typedef Kokkos::View ViewHostVectorType; - typedef Kokkos::View ViewMatrixType; + typedef Kokkos::View ViewVectorType; + typedef Kokkos::View + ViewHostVectorType; + typedef Kokkos::View ViewMatrixType; unsigned int n = A.numRows(); @@ -126,233 +151,263 @@ template< class ScalarType, class Layout, class EXSP, class OrdinalType = int > const int m = opts.m; // Check compatibility of dimensions at run time. 
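+  // NOTE (descriptive comment, no new behavior): the checks below enforce the
+  // preconditions of this restarted GMRES implementation: A must be square
+  // (n x n), X and B must both have length n, the restart length m must be
+  // positive, and maxRestart must be non-negative; violations raise
+  // KokkosKernels::Impl::throw_runtime_exception or std::invalid_argument.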
- if ( n != unsigned(A.numCols()) ){ + if (n != unsigned(A.numCols())) { std::ostringstream os; os << "gmres: A must be a square matrix: " - << "numRows: " << n << " numCols: " << A.numCols(); - Kokkos::Impl::throw_runtime_exception (os.str ()); + << "numRows: " << n << " numCols: " << A.numCols(); + KokkosKernels::Impl::throw_runtime_exception(os.str()); } - if (X.extent(0) != B.extent(0) || - X.extent(0) != n ) { + if (X.extent(0) != B.extent(0) || X.extent(0) != n) { std::ostringstream os; os << "gmres: Dimensions of A, X, and B do not match: " << "A: " << n << " x " << n << ", X: " << X.extent(0) << "x 1, B: " << B.extent(0) << " x 1"; - Kokkos::Impl::throw_runtime_exception (os.str ()); + KokkosKernels::Impl::throw_runtime_exception(os.str()); } - //Check parameter validity: - if(m <= 0){ - throw std::invalid_argument("gmres: Please choose restart size m greater than zero."); + // Check parameter validity: + if (m <= 0) { + throw std::invalid_argument( + "gmres: Please choose restart size m greater than zero."); } - if(opts.maxRestart < 0){ - throw std::invalid_argument("gmres: Please choose maxRestart greater than zero."); + if (opts.maxRestart < 0) { + throw std::invalid_argument( + "gmres: Please choose maxRestart greater than zero."); } bool converged = false; - int cycle = 0; // How many times have we restarted? - int numIters = 0; //Number of iterations within the cycle before convergence. + int cycle = 0; // How many times have we restarted? + int numIters = 0; // Number of iterations within the cycle before + // convergence. MT nrmB, trueRes, relRes, shortRelRes; GmresStats myStats; std::cout << "Convergence tolerance is: " << opts.tol << std::endl; - ViewVectorType Xiter("Xiter",n); //Intermediate solution at iterations before restart. - ViewVectorType Res(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Res"),n); //Residual vector - ViewVectorType Wj(Kokkos::view_alloc(Kokkos::WithoutInitializing, "W_j"),n); //Tmp work vector 1 - ViewVectorType Wj2(Kokkos::view_alloc(Kokkos::WithoutInitializing, "W_j2"),n); //Tmp work vector 2 - ViewHostVectorType GVec_h(Kokkos::view_alloc(Kokkos::WithoutInitializing, "GVec"),m+1); - ViewMatrixType GLsSoln("GLsSoln",m,1);//LS solution vec for Givens Rotation. Must be 2-D for trsm. - typename ViewMatrixType::HostMirror GLsSoln_h = Kokkos::create_mirror_view(GLsSoln); //This one is needed for triangular solve. - ViewHostVectorType CosVal_h("CosVal",m); - ViewHostVectorType SinVal_h("SinVal",m); - ViewMatrixType V(Kokkos::view_alloc(Kokkos::WithoutInitializing, "V"),n,m+1); - ViewMatrixType VSub; //Subview of 1st m cols for updating soln. - ViewVectorType orthoTmp(Kokkos::view_alloc(Kokkos::WithoutInitializing, "orthoTmp"),m); - - ViewMatrixType H("H",m+1,m); //H matrix on device. Also used in Arn Rec debug. - typename ViewMatrixType::HostMirror H_h = Kokkos::create_mirror_view(H); //Make H into a host view of H. - - //Compute initial residuals: + ViewVectorType Xiter( + "Xiter", n); // Intermediate solution at iterations before restart. + ViewVectorType Res(Kokkos::view_alloc(Kokkos::WithoutInitializing, "Res"), + n); // Residual vector + ViewVectorType Wj(Kokkos::view_alloc(Kokkos::WithoutInitializing, "W_j"), + n); // Tmp work vector 1 + ViewVectorType Wj2(Kokkos::view_alloc(Kokkos::WithoutInitializing, "W_j2"), + n); // Tmp work vector 2 + ViewHostVectorType GVec_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "GVec"), m + 1); + ViewMatrixType GLsSoln( + "GLsSoln", m, + 1); // LS solution vec for Givens Rotation. Must be 2-D for trsm. 
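+  // NOTE (descriptive comment on the workspace allocated above and below):
+  //   V        : n x (m+1), the Arnoldi/Krylov basis vectors
+  //   H / H_h  : (m+1) x m upper-Hessenberg matrix (host mirror used for the
+  //              Givens updates and the small triangular solve)
+  //   GVec_h, CosVal_h, SinVal_h : rotated right-hand side and Givens coefficients
+  //   Wj, Wj2  : scratch vectors for SpMV and preconditioner applications
+  //   Xiter    : intermediate solution, so X is only overwritten on acceptance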
+ typename ViewMatrixType::HostMirror GLsSoln_h = Kokkos::create_mirror_view( + GLsSoln); // This one is needed for triangular solve. + ViewHostVectorType CosVal_h("CosVal", m); + ViewHostVectorType SinVal_h("SinVal", m); + ViewMatrixType V(Kokkos::view_alloc(Kokkos::WithoutInitializing, "V"), n, + m + 1); + ViewMatrixType VSub; // Subview of 1st m cols for updating soln. + ViewVectorType orthoTmp( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "orthoTmp"), m); + + ViewMatrixType H("H", m + 1, + m); // H matrix on device. Also used in Arn Rec debug. + typename ViewMatrixType::HostMirror H_h = + Kokkos::create_mirror_view(H); // Make H into a host view of H. + + // Compute initial residuals: nrmB = KokkosBlas::nrm2(B); - Kokkos::deep_copy(Res,B); + Kokkos::deep_copy(Res, B); - //This is initial true residual, so don't need prec here. - KokkosSparse::spmv("N", one, A, X, zero, Wj); // wj = Ax - KokkosBlas::axpy(-one, Wj, Res); // res = res-Wj = b-Ax. + // This is initial true residual, so don't need prec here. + KokkosSparse::spmv("N", one, A, X, zero, Wj); // wj = Ax + KokkosBlas::axpy(-one, Wj, Res); // res = res-Wj = b-Ax. trueRes = KokkosBlas::nrm2(Res); - if( nrmB != 0 ){ - relRes = trueRes/nrmB; - } - else if( trueRes == 0 ){ + if (nrmB != 0) { + relRes = trueRes / nrmB; + } else if (trueRes == 0) { relRes = trueRes; - } - else{ //B is zero, but X has wrong initial guess. - Kokkos::deep_copy(X,0.0); + } else { // B is zero, but X has wrong initial guess. + Kokkos::deep_copy(X, 0.0); relRes = 0; } shortRelRes = relRes; std::cout << "Initial relative residual is: " << relRes << std::endl; - if( relRes < opts.tol ){ + if (relRes < opts.tol) { converged = true; } - while( !converged && cycle <= opts.maxRestart && shortRelRes >= 1e-14){ + while (!converged && cycle <= opts.maxRestart && shortRelRes >= 1e-14) { GVec_h(0) = trueRes; // Run Arnoldi iteration: - auto Vj = Kokkos::subview(V,Kokkos::ALL,0); - Kokkos::deep_copy(Vj,Res); - KokkosBlas::scal(Vj,one/trueRes,Vj); //V0 = V0/norm(V0) - - for (int j = 0; j < m; j++){ - if( M != NULL){ //Apply Right prec - M->apply(Vj, Wj2); // wj2 = M*Vj - KokkosSparse::spmv("N", one, A, Wj2, zero, Wj); //wj = A*MVj = A*Wj2 - } - else{ - KokkosSparse::spmv("N", one, A, Vj, zero, Wj); //wj = A*Vj + auto Vj = Kokkos::subview(V, Kokkos::ALL, 0); + Kokkos::deep_copy(Vj, Res); + KokkosBlas::scal(Vj, one / trueRes, Vj); // V0 = V0/norm(V0) + + for (int j = 0; j < m; j++) { + if (M != NULL) { // Apply Right prec + M->apply(Vj, Wj2); // wj2 = M*Vj + KokkosSparse::spmv("N", one, A, Wj2, zero, Wj); // wj = A*MVj = A*Wj2 + } else { + KokkosSparse::spmv("N", one, A, Vj, zero, Wj); // wj = A*Vj } Kokkos::Profiling::pushRegion("GMRES::Orthog:"); - if( opts.ortho == "MGS"){ - for (int i = 0; i <= j; i++){ - auto Vi = Kokkos::subview(V,Kokkos::ALL,i); - H_h(i,j) = KokkosBlas::dot(Vi,Wj); //Vi^* Wj - KokkosBlas::axpy(-H_h(i,j),Vi,Wj);//wj = wj-Hij*Vi + if (opts.ortho == "MGS") { + for (int i = 0; i <= j; i++) { + auto Vi = Kokkos::subview(V, Kokkos::ALL, i); + H_h(i, j) = KokkosBlas::dot(Vi, Wj); // Vi^* Wj + KokkosBlas::axpy(-H_h(i, j), Vi, Wj); // wj = wj-Hij*Vi } - auto Hj_h = Kokkos::subview(H_h,Kokkos::make_pair(0,j+1) ,j); - } - else if( opts.ortho == "CGS2"){ - auto V0j = Kokkos::subview(V,Kokkos::ALL,Kokkos::make_pair(0,j+1)); - auto Hj = Kokkos::subview(H,Kokkos::make_pair(0,j+1) ,j); - auto Hj_h = Kokkos::subview(H_h,Kokkos::make_pair(0,j+1) ,j); - KokkosBlas::gemv("C", one, V0j, Wj, zero, Hj); // Hj = Vj^T * wj - KokkosBlas::gemv("N", -one, V0j, Hj, one, 
Wj); // wj = wj - Vj * Hj - - //Re-orthog CGS: - auto orthoTmpSub = Kokkos::subview(orthoTmp,Kokkos::make_pair(0,j+1)); - KokkosBlas::gemv("C", one, V0j, Wj, zero, orthoTmpSub); // tmp (Hj) = Vj^T * wj - KokkosBlas::gemv("N", -one, V0j, orthoTmpSub, one, Wj); // wj = wj - Vj * tmp - KokkosBlas::axpy(one, orthoTmpSub, Hj); // Hj = Hj + tmp - Kokkos::deep_copy(Hj_h,Hj); - } - else { - throw std::invalid_argument("Invalid argument for 'ortho'. Please use 'CGS2' or 'MGS'."); + auto Hj_h = Kokkos::subview(H_h, Kokkos::make_pair(0, j + 1), j); + } else if (opts.ortho == "CGS2") { + auto V0j = Kokkos::subview(V, Kokkos::ALL, Kokkos::make_pair(0, j + 1)); + auto Hj = Kokkos::subview(H, Kokkos::make_pair(0, j + 1), j); + auto Hj_h = Kokkos::subview(H_h, Kokkos::make_pair(0, j + 1), j); + KokkosBlas::gemv("C", one, V0j, Wj, zero, Hj); // Hj = Vj^T * wj + KokkosBlas::gemv("N", -one, V0j, Hj, one, Wj); // wj = wj - Vj * Hj + + // Re-orthog CGS: + auto orthoTmpSub = + Kokkos::subview(orthoTmp, Kokkos::make_pair(0, j + 1)); + KokkosBlas::gemv("C", one, V0j, Wj, zero, + orthoTmpSub); // tmp (Hj) = Vj^T * wj + KokkosBlas::gemv("N", -one, V0j, orthoTmpSub, one, + Wj); // wj = wj - Vj * tmp + KokkosBlas::axpy(one, orthoTmpSub, Hj); // Hj = Hj + tmp + Kokkos::deep_copy(Hj_h, Hj); + } else { + throw std::invalid_argument( + "Invalid argument for 'ortho'. Please use 'CGS2' or 'MGS'."); } - MT tmpNrm = KokkosBlas::nrm2(Wj); - H_h(j+1,j) = tmpNrm; - if(tmpNrm > 1e-14){ - Vj = Kokkos::subview(V,Kokkos::ALL,j+1); - KokkosBlas::scal(Vj,one/H_h(j+1,j),Wj); // Vj = Wj/H(j+1,j) + MT tmpNrm = KokkosBlas::nrm2(Wj); + H_h(j + 1, j) = tmpNrm; + if (tmpNrm > 1e-14) { + Vj = Kokkos::subview(V, Kokkos::ALL, j + 1); + KokkosBlas::scal(Vj, one / H_h(j + 1, j), Wj); // Vj = Wj/H(j+1,j) } Kokkos::Profiling::popRegion(); - // Givens for real and complex (See Alg 3 in "On computing Givens rotations reliably and efficiently" - // by Demmel, et. al. 2001) - // Apply Givens rotation and compute shortcut residual: - for(int i=0; i= opts.tol){ - throw std::runtime_error("GMRES has experienced lucky breakdown, but the residual has not converged.\n\ + ST f = H_h(j, j); + ST g = H_h(j + 1, j); + MT f2 = AT::real(f) * AT::real(f) + AT::imag(f) * AT::imag(f); + MT g2 = AT::real(g) * AT::real(g) + AT::imag(g) * AT::imag(g); + ST fg2 = f2 + g2; + ST D1 = one / AT::sqrt(f2 * fg2); + CosVal_h(j) = f2 * D1; + fg2 = fg2 * D1; + H_h(j, j) = f * fg2; + SinVal_h(j) = f * D1 * AT::conj(g); + H_h(j + 1, j) = zero; + + GVec_h(j + 1) = GVec_h(j) * (-AT::conj(SinVal_h(j))); + GVec_h(j) = GVec_h(j) * CosVal_h(j); + shortRelRes = fabs(GVec_h(j + 1)) / nrmB; + + std::cout << "Shortcut relative residual for iteration " + << j + (cycle * m) << " is: " << shortRelRes << std::endl; + if (tmpNrm <= 1e-14 && shortRelRes >= opts.tol) { + throw std::runtime_error( + "GMRES has experienced lucky breakdown, but the residual has not converged.\n\ Solver terminated without convergence."); } - if( AT::isNan(ST(shortRelRes)) ){ - throw std::runtime_error("gmres: Relative residual is nan. Terminating solver."); + if (AT::isNan(ST(shortRelRes))) { + throw std::runtime_error( + "gmres: Relative residual is nan. Terminating solver."); } - //If short residual converged, or time to restart, check true residual - if( shortRelRes < opts.tol || j == m-1 ) { - //Compute least squares soln with Givens rotation: - auto GLsSolnSub_h = Kokkos::subview(GLsSoln_h,Kokkos::ALL,0); //Original view has rank 2, need a rank 1 here. 
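+      // NOTE (descriptive comment): because the Givens rotations above keep H
+      // upper triangular and accumulate the rotated residual norm in GVec, the
+      // least-squares problem min ||beta*e1 - H*y|| reduces to the small
+      // (j+1) x (j+1) host-side trsm below; the update is then
+      // x_iter = x + V(:,0:j)*y via gemv, with the right preconditioner M
+      // applied to the correction when M != NULL.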
- auto GVecSub_h = Kokkos::subview(GVec_h, Kokkos::make_pair(0,m)); - Kokkos::deep_copy(GLsSolnSub_h, GVecSub_h); //Copy LS rhs vec for triangle solve. - auto GLsSolnSub2_h = Kokkos::subview(GLsSoln_h,Kokkos::make_pair(0,j+1),Kokkos::ALL); - auto H_Sub_h = Kokkos::subview(H_h, Kokkos::make_pair(0,j+1), Kokkos::make_pair(0,j+1)); - KokkosBlas::trsm("L", "U", "N", "N", one, H_Sub_h, GLsSolnSub2_h); //GLsSoln = H\GLsSoln + // If short residual converged, or time to restart, check true residual + if (shortRelRes < opts.tol || j == m - 1) { + // Compute least squares soln with Givens rotation: + auto GLsSolnSub_h = Kokkos::subview( + GLsSoln_h, Kokkos::ALL, + 0); // Original view has rank 2, need a rank 1 here. + auto GVecSub_h = Kokkos::subview(GVec_h, Kokkos::make_pair(0, m)); + Kokkos::deep_copy(GLsSolnSub_h, + GVecSub_h); // Copy LS rhs vec for triangle solve. + auto GLsSolnSub2_h = Kokkos::subview( + GLsSoln_h, Kokkos::make_pair(0, j + 1), Kokkos::ALL); + auto H_Sub_h = Kokkos::subview(H_h, Kokkos::make_pair(0, j + 1), + Kokkos::make_pair(0, j + 1)); + KokkosBlas::trsm("L", "U", "N", "N", one, H_Sub_h, + GLsSolnSub2_h); // GLsSoln = H\GLsSoln Kokkos::deep_copy(GLsSoln, GLsSoln_h); - //Update solution and compute residual with Givens: - VSub = Kokkos::subview(V,Kokkos::ALL,Kokkos::make_pair(0,j+1)); - Kokkos::deep_copy(Xiter,X); //Can't overwrite X with intermediate solution. - auto GLsSolnSub3 = Kokkos::subview(GLsSoln,Kokkos::make_pair(0,j+1),0); - if(M != NULL){ //Apply right prec to correct soln. - KokkosBlas::gemv ("N", one, VSub, GLsSolnSub3, zero, Wj); //wj = V(1:j+1)*lsSoln - M->apply(Wj, Xiter, "N", one, one); //Xiter = M*wj + X - } - else{ - KokkosBlas::gemv ("N", one, VSub, GLsSolnSub3, one, Xiter); //x_iter = x + V(1:j+1)*lsSoln + // Update solution and compute residual with Givens: + VSub = Kokkos::subview(V, Kokkos::ALL, Kokkos::make_pair(0, j + 1)); + Kokkos::deep_copy(Xiter, + X); // Can't overwrite X with intermediate solution. + auto GLsSolnSub3 = + Kokkos::subview(GLsSoln, Kokkos::make_pair(0, j + 1), 0); + if (M != NULL) { // Apply right prec to correct soln. + KokkosBlas::gemv("N", one, VSub, GLsSolnSub3, zero, + Wj); // wj = V(1:j+1)*lsSoln + M->apply(Wj, Xiter, "N", one, one); // Xiter = M*wj + X + } else { + KokkosBlas::gemv("N", one, VSub, GLsSolnSub3, one, + Xiter); // x_iter = x + V(1:j+1)*lsSoln } - KokkosSparse::spmv("N", one, A, Xiter, zero, Wj); // wj = Ax - Kokkos::deep_copy(Res,B); // Reset r=b. - KokkosBlas::axpy(-one, Wj, Res); // r = b-Ax. + KokkosSparse::spmv("N", one, A, Xiter, zero, Wj); // wj = Ax + Kokkos::deep_copy(Res, B); // Reset r=b. + KokkosBlas::axpy(-one, Wj, Res); // r = b-Ax. trueRes = KokkosBlas::nrm2(Res); - relRes = trueRes/nrmB; - std::cout << "True relative residual for iteration " << j+(cycle*m) << " is : " << relRes << std::endl; - numIters = j+1; + relRes = trueRes / nrmB; + std::cout << "True relative residual for iteration " << j + (cycle * m) + << " is : " << relRes << std::endl; + numIters = j + 1; - if(relRes < opts.tol){ + if (relRes < opts.tol) { converged = true; - Kokkos::deep_copy(X, Xiter); //Final solution is the iteration solution. - break; //End Arnoldi iteration. - } - else if(shortRelRes < 1e-30){ - std::cout << "Short residual has converged to machine zero, but true residual is not converged.\n" - << "You may have given GMRES a singular matrix. Ending the GMRES iteration." + Kokkos::deep_copy( + X, Xiter); // Final solution is the iteration solution. + break; // End Arnoldi iteration. 
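+        // NOTE (descriptive comment): two residual estimates are tracked.
+        // shortRelRes is the "shortcut" value |GVec(j+1)|/||b|| that the Givens
+        // recurrence yields at every iteration; trueRes/relRes is the explicitly
+        // recomputed ||b - A*x||/||b||, evaluated only when the shortcut passes
+        // the tolerance or the cycle ends. Convergence is declared on the true
+        // residual.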
+ } else if (shortRelRes < 1e-30) { + std::cout << "Short residual has converged to machine zero, but true " + "residual is not converged.\n" + << "You may have given GMRES a singular matrix. Ending the " + "GMRES iteration." << std::endl; - break; //End Arnoldi iteration; we can't make any more progress. + break; // End Arnoldi iteration; we can't make any more progress. } } - }//end Arnoldi iter. + } // end Arnoldi iter. cycle++; - //This is the end, or it's time to restart. Update solution to most recent vector. + // This is the end, or it's time to restart. Update solution to most + // recent vector. Kokkos::deep_copy(X, Xiter); } std::cout << "Ending relative residual is: " << relRes << std::endl; myStats.endRelRes = static_cast(relRes); - if( converged ){ + if (converged) { std::cout << "Solver converged! " << std::endl; myStats.convFlagVal = GmresStats::FLAG::Conv; - } - else if( shortRelRes < opts.tol ){ - std::cout << "Shortcut residual converged, but solver experienced a loss of accuracy." << std::endl; + } else if (shortRelRes < opts.tol) { + std::cout << "Shortcut residual converged, but solver experienced a loss " + "of accuracy." + << std::endl; myStats.convFlagVal = GmresStats::FLAG::LOA; - } - else{ + } else { std::cout << "Solver did not converge. :( " << std::endl; myStats.convFlagVal = GmresStats::FLAG::NoConv; } - if(cycle > 0){ - myStats.numIters = (cycle-1)*m + numIters; - } - else{ + if (cycle > 0) { + myStats.numIters = (cycle - 1) * m + numIters; + } else { myStats.numIters = 0; } - std::cout << "The solver completed " << myStats.numIters << " iterations." << std::endl; + std::cout << "The solver completed " << myStats.numIters << " iterations." + << std::endl; Kokkos::Profiling::popRegion(); return myStats; diff --git a/example/gmres/test_cmplx_A.cpp b/example/gmres/test_cmplx_A.cpp index 322273db15..a19d6ad7e1 100644 --- a/example/gmres/test_cmplx_A.cpp +++ b/example/gmres/test_cmplx_A.cpp @@ -42,114 +42,126 @@ //@HEADER */ -#include -#include"KokkosKernels_IOUtils.hpp" -#include -#include -#include -#include -#include +#include +#include "KokkosKernels_IOUtils.hpp" +#include +#include +#include +#include +#include -#include"gmres.hpp" +#include "gmres.hpp" -int main(int /*argc*/, char ** /*argv[]*/) { +int main(int /*argc*/, char** /*argv[]*/) { + typedef Kokkos::complex ST; + typedef int OT; + typedef Kokkos::DefaultExecutionSpace EXSP; - typedef Kokkos::complex ST; - typedef int OT; - typedef Kokkos::DefaultExecutionSpace EXSP; + using ViewVectorType = Kokkos::View; - using ViewVectorType = Kokkos::View; - - std::string filename("young1c.mtx"); // example matrix + std::string filename("young1c.mtx"); // example matrix GmresOpts solverOpts; - solverOpts.m = 100; //Max subspace size before restarting. - solverOpts.tol = 1e-05; //Relative residual convergence tolerance. + solverOpts.m = 100; // Max subspace size before restarting. + solverOpts.tol = 1e-05; // Relative residual convergence tolerance. solverOpts.maxRestart = 60; - solverOpts.ortho = "CGS2"; //orthog type - bool pass1 = false; - bool pass2 = false; + solverOpts.ortho = "CGS2"; // orthog type + bool pass1 = false; + bool pass2 = false; std::cout << "File to process is: " << filename << std::endl; std::cout << "Convergence tolerance is: " << solverOpts.tol << std::endl; - //Initialize Kokkos AFTER parsing parameters: + // Initialize Kokkos AFTER parsing parameters: Kokkos::initialize(); { - - // Read in a matrix Market file and use it to test the Kokkos Operator. 
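+    // NOTE: a minimal sketch of the MatrixMarket load pattern used below, with
+    // the type aliases assumed from this test (ST = Kokkos::complex<double>,
+    // OT = int, EXSP = Kokkos::DefaultExecutionSpace):
+    //
+    //   using crs_t = KokkosSparse::CrsMatrix<ST, OT, EXSP>;
+    //   crs_t A =
+    //       KokkosKernels::Impl::read_kokkos_crst_matrix<crs_t>("young1c.mtx");
+    //   // A now lives in EXSP memory and can be passed straight to gmres().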
- KokkosSparse::CrsMatrix A = - KokkosKernels::Impl::read_kokkos_crst_matrix>(filename.c_str()); - - int n = A.numRows(); - ViewVectorType X("X",n); //Solution and initial guess - ViewVectorType Wj("Wj",n); //For checking residuals at end. - ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"),n);//right-hand side vec - - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B,1.0); - - std::cout << "Testing GMRES with CGS2 ortho:" << std::endl; - GmresStats solveStats = gmres(A, B, X, solverOpts); - - // Double check residuals at end of solve: - double nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. - double endRes = KokkosBlas::nrm2(B)/nrmB; - std::cout << "=======================================" << std::endl; - std::cout << "Verify from main: Ending residual is " << endRes << std::endl; - std::cout << "Number of iterations is: " << solveStats.numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl; - std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; - - if( solveStats.numIters < 700 && solveStats.numIters > 600 && endRes < solverOpts.tol){ - std::cout << "Test CGS2 Passed!" << std::endl; - pass1 = true; + // Read in a matrix Market file and use it to test the Kokkos Operator. + KokkosSparse::CrsMatrix A = + KokkosKernels::Impl::read_kokkos_crst_matrix< + KokkosSparse::CrsMatrix>(filename.c_str()); + + int n = A.numRows(); + ViewVectorType X("X", n); // Solution and initial guess + ViewVectorType Wj("Wj", n); // For checking residuals at end. + ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + n); // right-hand side vec + + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + + std::cout << "Testing GMRES with CGS2 ortho:" << std::endl; + GmresStats solveStats = + gmres(A, B, X, solverOpts); + + // Double check residuals at end of solve: + double nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + double endRes = KokkosBlas::nrm2(B) / nrmB; + std::cout << "=======================================" << std::endl; + std::cout << "Verify from main: Ending residual is " << endRes << std::endl; + std::cout << "Number of iterations is: " << solveStats.numIters + << std::endl; + std::cout << "Diff of residual from main - residual from solver: " + << solveStats.endRelRes - endRes << std::endl; + std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; + + if (solveStats.numIters < 700 && solveStats.numIters > 600 && + endRes < solverOpts.tol) { + std::cout << "Test CGS2 Passed!" << std::endl; + pass1 = true; + } else { + std::cout << "Solver did not converge within the expected number of " + "iterations. " + << std::endl + << "CGS2 Test Failed." << std::endl; } - else{ - std::cout << "Solver did not converge within the expected number of iterations. " << std::endl - << "CGS2 Test Failed." 
<< std::endl; - } - std::cout << "=======================================" << std::endl << std::endl << std::endl; - - solverOpts.ortho = "MGS"; - Kokkos::deep_copy(X,0.0); - Kokkos::deep_copy(B,1.0); - - std::cout << "Testing GMRES with MGS ortho:" << std::endl; - solveStats = gmres(A, B, X, solverOpts); - - // Double check residuals at end of solve: - nrmB = KokkosBlas::nrm2(B); - KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax - KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. - endRes = KokkosBlas::nrm2(B)/nrmB; - std::cout << "=======================================" << std::endl; - std::cout << "Verify from main: Ending residual is " << endRes << std::endl; - std::cout << "Number of iterations is: " << solveStats.numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl; - std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; - - if( solveStats.numIters < 700 && solveStats.numIters > 600 && endRes < solverOpts.tol){ - std::cout << "Test MGS Passed!" << std::endl; - if( pass1 ){ pass2 = true; }; - } - else{ - std::cout << "Solver did not converge within the expected number of iterations. " << std::endl - << "MGS Test Failed." << std::endl; - } - std::cout << "=======================================" << std::endl << std::endl << std::endl; - + std::cout << "=======================================" << std::endl + << std::endl + << std::endl; + + solverOpts.ortho = "MGS"; + Kokkos::deep_copy(X, 0.0); + Kokkos::deep_copy(B, 1.0); + + std::cout << "Testing GMRES with MGS ortho:" << std::endl; + solveStats = gmres(A, B, X, solverOpts); + + // Double check residuals at end of solve: + nrmB = KokkosBlas::nrm2(B); + KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj); // wj = Ax + KokkosBlas::axpy(-1.0, Wj, B); // b = b-Ax. + endRes = KokkosBlas::nrm2(B) / nrmB; + std::cout << "=======================================" << std::endl; + std::cout << "Verify from main: Ending residual is " << endRes << std::endl; + std::cout << "Number of iterations is: " << solveStats.numIters + << std::endl; + std::cout << "Diff of residual from main - residual from solver: " + << solveStats.endRelRes - endRes << std::endl; + std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; + + if (solveStats.numIters < 700 && solveStats.numIters > 600 && + endRes < solverOpts.tol) { + std::cout << "Test MGS Passed!" << std::endl; + if (pass1) { + pass2 = true; + }; + } else { + std::cout << "Solver did not converge within the expected number of " + "iterations. " + << std::endl + << "MGS Test Failed." << std::endl; + } + std::cout << "=======================================" << std::endl + << std::endl + << std::endl; } Kokkos::finalize(); - if(pass2){ - std::cout << "Both tests have passed!!" << std::endl; - } - else{ + if (pass2) { + std::cout << "Both tests have passed!!" << std::endl; + } else { std::cout << "One or more tests has failed." << std::endl; } - return ( pass2 ? EXIT_SUCCESS : EXIT_FAILURE ); + return (pass2 ? 
EXIT_SUCCESS : EXIT_FAILURE); } - diff --git a/example/gmres/test_real_A.cpp b/example/gmres/test_real_A.cpp index 1a0bb09683..3f6edd06a3 100644 --- a/example/gmres/test_real_A.cpp +++ b/example/gmres/test_real_A.cpp @@ -42,18 +42,18 @@ //@HEADER */ -#include -#include"KokkosKernels_IOUtils.hpp" -#include -#include -#include -#include -#include +#include +#include "KokkosKernels_IOUtils.hpp" +#include +#include +#include +#include +#include #include -#include"gmres.hpp" +#include "gmres.hpp" -int main(int /*argc*/, char ** /*argv[]*/) { +int main(int /*argc*/, char** /*argv[]*/) { typedef double ST; typedef int OT; typedef Kokkos::DefaultExecutionSpace EXSP; @@ -70,101 +70,111 @@ int main(int /*argc*/, char ** /*argv[]*/) { GmresOpts solverOpts; solverOpts.ortho = "CGS2"; // orthog type solverOpts.m = 15; // Max subspace size before restarting. - solverOpts.tol = 1e-10; //Relative residual convergence tolerance. + solverOpts.tol = 1e-10; // Relative residual convergence tolerance. solverOpts.maxRestart = 50; - bool pass1 = false; - bool pass2 = false; + bool pass1 = false; + bool pass2 = false; std::cout << "Convergence tolerance is: " << solverOpts.tol << std::endl; // Initialize Kokkos AFTER parsing parameters: Kokkos::initialize(); { - // Create a diagonally dominant sparse matrix to test: - ncST nnz; - cOT n = 5000; - cOT numRows = n; - cOT numCols = n; - cOT diagDominance = 1; - nnz = 10 * numRows; - sp_matrix_type A = KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix - (numRows, numCols, nnz, 0, ncOT(0.01 * numRows), diagDominance); - - // Set initial vectors: - ViewVectorType X("X",n); //Solution and initial guess - ViewVectorType Wj("Wj",n); //For checking residuals at end. - ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"),n);//right-hand side vec - - // Make rhs ones so that results are repeatable: - Kokkos::deep_copy(B,1.0); - - std::cout << "Testing GMRES with CGS2 ortho:" << std::endl; - GmresStats solveStats = gmres(A, B, X, solverOpts); - - // Double check residuals at end of solve: - double nrmB = static_cast(KokkosBlas::nrm2(B)); - KokkosSparse::spmv("N", ST(1.0), A, X, ST(0.0), Wj); // wj = Ax - KokkosBlas::axpy(ST(-1.0), Wj, B); // b = b-Ax. - double endRes = KokkosBlas::nrm2(B)/nrmB; - std::cout << "=======================================" << std::endl; - std::cout << "Verify from main: Ending residual is " << endRes << std::endl; - std::cout << "Number of iterations is: " << solveStats.numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl; - std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; - - if (solveStats.numIters < 40 && solveStats.numIters > 20 && - endRes < static_cast(solverOpts.tol)) { - std::cout << "Test CGS2 Passed!" << std::endl; - pass1 = true; - } else { - std::cout - << "Solver did not converge within the expected number of iterations. " - << std::endl - << "CGS2 Test Failed." << std::endl; - } - std::cout << "=======================================" << std::endl << std::endl << std::endl; - - solverOpts.ortho = "MGS"; - Kokkos::deep_copy(X,0.0); - Kokkos::deep_copy(B,1.0); - - std::cout << "Testing GMRES with MGS ortho:" << std::endl; - solveStats = gmres(A, B, X, solverOpts); - - // Double check residuals at end of solve: - nrmB = static_cast(KokkosBlas::nrm2(B)); - KokkosSparse::spmv("N", ST(1.0), A, X, ST(0.0), Wj); // wj = Ax - KokkosBlas::axpy(ST(-1.0), Wj, B); // b = b-Ax. 
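+    // NOTE: a minimal sketch of the residual-verification pattern repeated in
+    // these tests, using the views declared above (here ST = double):
+    //
+    //   double nrmB = KokkosBlas::nrm2(B);            // ||b||
+    //   KokkosSparse::spmv("N", 1.0, A, X, 0.0, Wj);  // Wj = A*x
+    //   KokkosBlas::axpy(-1.0, Wj, B);                // B  = b - A*x (B overwritten)
+    //   double endRes = KokkosBlas::nrm2(B) / nrmB;   // independent relative residual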
- endRes = KokkosBlas::nrm2(B)/nrmB; - std::cout << "=======================================" << std::endl; - std::cout << "Verify from main: Ending residual is " << endRes << std::endl; - std::cout << "Number of iterations is: " << solveStats.numIters << std::endl; - std::cout << "Diff of residual from main - residual from solver: " << solveStats.endRelRes - endRes << std::endl; - std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; - - if (solveStats.numIters < 40 && solveStats.numIters > 20 && - endRes < static_cast(solverOpts.tol)) { - std::cout << "Test MGS Passed!" << std::endl; - if (pass1) { - pass2 = true; - }; - } else { - std::cout - << "Solver did not converge within the expected number of iterations. " - << std::endl - << "MGS Test Failed." << std::endl; - } - std::cout << "=======================================" << std::endl << std::endl << std::endl; - + // Create a diagonally dominant sparse matrix to test: + ncST nnz; + cOT n = 5000; + cOT numRows = n; + cOT numCols = n; + cOT diagDominance = 1; + nnz = 10 * numRows; + sp_matrix_type A = + KokkosKernels::Impl::kk_generate_diagonally_dominant_sparse_matrix< + sp_matrix_type>(numRows, numCols, nnz, 0, ncOT(0.01 * numRows), + diagDominance); + + // Set initial vectors: + ViewVectorType X("X", n); // Solution and initial guess + ViewVectorType Wj("Wj", n); // For checking residuals at end. + ViewVectorType B(Kokkos::view_alloc(Kokkos::WithoutInitializing, "B"), + n); // right-hand side vec + + // Make rhs ones so that results are repeatable: + Kokkos::deep_copy(B, 1.0); + + std::cout << "Testing GMRES with CGS2 ortho:" << std::endl; + GmresStats solveStats = + gmres(A, B, X, solverOpts); + + // Double check residuals at end of solve: + double nrmB = static_cast(KokkosBlas::nrm2(B)); + KokkosSparse::spmv("N", ST(1.0), A, X, ST(0.0), Wj); // wj = Ax + KokkosBlas::axpy(ST(-1.0), Wj, B); // b = b-Ax. + double endRes = KokkosBlas::nrm2(B) / nrmB; + std::cout << "=======================================" << std::endl; + std::cout << "Verify from main: Ending residual is " << endRes << std::endl; + std::cout << "Number of iterations is: " << solveStats.numIters + << std::endl; + std::cout << "Diff of residual from main - residual from solver: " + << solveStats.endRelRes - endRes << std::endl; + std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; + + if (solveStats.numIters < 40 && solveStats.numIters > 20 && + endRes < static_cast(solverOpts.tol)) { + std::cout << "Test CGS2 Passed!" << std::endl; + pass1 = true; + } else { + std::cout << "Solver did not converge within the expected number of " + "iterations. " + << std::endl + << "CGS2 Test Failed." << std::endl; + } + std::cout << "=======================================" << std::endl + << std::endl + << std::endl; + + solverOpts.ortho = "MGS"; + Kokkos::deep_copy(X, 0.0); + Kokkos::deep_copy(B, 1.0); + + std::cout << "Testing GMRES with MGS ortho:" << std::endl; + solveStats = gmres(A, B, X, solverOpts); + + // Double check residuals at end of solve: + nrmB = static_cast(KokkosBlas::nrm2(B)); + KokkosSparse::spmv("N", ST(1.0), A, X, ST(0.0), Wj); // wj = Ax + KokkosBlas::axpy(ST(-1.0), Wj, B); // b = b-Ax. 
+ endRes = KokkosBlas::nrm2(B) / nrmB; + std::cout << "=======================================" << std::endl; + std::cout << "Verify from main: Ending residual is " << endRes << std::endl; + std::cout << "Number of iterations is: " << solveStats.numIters + << std::endl; + std::cout << "Diff of residual from main - residual from solver: " + << solveStats.endRelRes - endRes << std::endl; + std::cout << "Convergence flag is : " << solveStats.convFlag() << std::endl; + + if (solveStats.numIters < 40 && solveStats.numIters > 20 && + endRes < static_cast(solverOpts.tol)) { + std::cout << "Test MGS Passed!" << std::endl; + if (pass1) { + pass2 = true; + }; + } else { + std::cout << "Solver did not converge within the expected number of " + "iterations. " + << std::endl + << "MGS Test Failed." << std::endl; + } + std::cout << "=======================================" << std::endl + << std::endl + << std::endl; } Kokkos::finalize(); - if(pass2){ - std::cout << "Both tests have passed!!" << std::endl; - } - else{ + if (pass2) { + std::cout << "Both tests have passed!!" << std::endl; + } else { std::cout << "One or more tests has failed." << std::endl; } - return ( pass2 ? EXIT_SUCCESS : EXIT_FAILURE ); + return (pass2 ? EXIT_SUCCESS : EXIT_FAILURE); } diff --git a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp index fa788b4daa..99b398e40c 100644 --- a/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp +++ b/example/graph/KokkosKernels_Example_Distance2GraphColor.cpp @@ -57,10 +57,8 @@ #include #include - using namespace KokkosGraph; - #ifdef KOKKOSKERNELS_INST_DOUBLE using kk_scalar_type = double; #else @@ -73,7 +71,7 @@ using kk_scalar_type = float; using kk_size_type = int; #else #ifdef KOKKOSKERNELS_INST_OFFSET_SIZE_T -using kk_size_type = size_t; +using kk_size_type = size_t; #endif #endif @@ -81,486 +79,483 @@ using kk_size_type = size_t; using kk_lno_type = int; #else #ifdef KOKKOSKERNELS_INST_ORDINAL_INT64_T -using kk_lno_type = int64_t; +using kk_lno_type = int64_t; #endif #endif - - using namespace KokkosGraph; namespace KokkosKernels { namespace Example { - - -struct Parameters -{ - int algorithm; - int repeat; - int chunk_size; - int output_graphviz_vert_max; - int output_graphviz; - int shmemsize; - int verbose_level; - int check_output; - char* coloring_input_file; - char* coloring_output_file; - int output_histogram; - int use_threads; - int use_openmp; - int use_cuda; - int use_serial; - int validate; - char* mtx_bin_file; - - Parameters() - { - algorithm = 0; - repeat = 6; - chunk_size = -1; - shmemsize = 16128; - verbose_level = 0; - check_output = 0; - coloring_input_file = NULL; - coloring_output_file = NULL; - output_histogram = 0; - output_graphviz = 0; - output_graphviz_vert_max = 1500; - use_threads = 0; - use_openmp = 0; - use_cuda = 0; - use_serial = 0; - validate = 0; - mtx_bin_file = NULL; - } +struct Parameters { + int algorithm; + int repeat; + int chunk_size; + int output_graphviz_vert_max; + int output_graphviz; + int shmemsize; + int verbose_level; + int check_output; + char* coloring_input_file; + char* coloring_output_file; + int output_histogram; + int use_threads; + int use_openmp; + int use_cuda; + int use_serial; + int validate; + char* mtx_bin_file; + + Parameters() { + algorithm = 0; + repeat = 6; + chunk_size = -1; + shmemsize = 16128; + verbose_level = 0; + check_output = 0; + coloring_input_file = NULL; + coloring_output_file = NULL; + output_histogram = 0; + output_graphviz 
= 0; + output_graphviz_vert_max = 1500; + use_threads = 0; + use_openmp = 0; + use_cuda = 0; + use_serial = 0; + validate = 0; + mtx_bin_file = NULL; + } }; - - -void -print_options(std::ostream& os, const char* app_name, unsigned int indent = 0) -{ - std::string spaces(indent, ' '); - os << "Usage:" << std::endl - << spaces << " " << app_name << " [parameters]" << std::endl - << std::endl - << spaces << "Parameters:" << std::endl - << spaces << " Parallelism (select one of the following):" << std::endl - << spaces << " --serial Execute serially." << std::endl - << spaces << " --threads Use N posix threads." << std::endl - << spaces << " --openmp Use OpenMP with N threads." << std::endl - << spaces << " --cuda Use CUDA" << std::endl - << std::endl - << spaces << " Required Parameters:" << std::endl - << spaces << " --amtx Input file in Matrix Market format (.mtx)." << std::endl - << std::endl - << spaces << " --algorithm Set the algorithm to use. Allowable values are:" << std::endl - << spaces << " COLORING_D2_MATRIX_SQUARED - Matrix-squared + Distance-1 method." << std::endl - << spaces << " COLORING_D2_SERIAL - Serial algorithm (must use with 'serial' mode)" << std::endl - << spaces << " COLORING_D2_VB - Vertex Based method using boolean forbidden array (Default)." << std::endl - << spaces << " COLORING_D2_VB_BIT - VB with Bitvector Forbidden Array" << std::endl - << spaces << " COLORING_D2_VB_BIT_EF - VB_BIT with Edge Filtering" << std::endl - << std::endl - << spaces << " Optional Parameters:" << std::endl - << spaces << " --output-histogram Print out a histogram of the colors." << std::endl - << spaces << " --output-graphviz Write the output to a graphviz file (G.dot)." << std::endl - << spaces << " Note: Vertices with color 0 will be filled in and colored" << std::endl - << spaces << " --output-graphviz-vert-max Upper limit of vertices in G to allow graphviz output. Default=1500." << std::endl - << spaces << " Requires --output-graphviz to also be enabled." << std::endl - << spaces << " --validate Check that the coloring is a valid distance-2 graph coloring" << std::endl - << spaces << " --verbose-level Set verbosity level [0..5] where N > 0 means print verbose messags." << std::endl - << spaces << " Default: 0" << std::endl - << spaces << " --help Print out command line help." << std::endl - << spaces << " " << std::endl; +void print_options(std::ostream& os, const char* app_name, + unsigned int indent = 0) { + std::string spaces(indent, ' '); + os << "Usage:" << std::endl + << spaces << " " << app_name << " [parameters]" << std::endl + << std::endl + << spaces << "Parameters:" << std::endl + << spaces << " Parallelism (select one of the following):" << std::endl + << spaces << " --serial Execute serially." << std::endl + << spaces << " --threads Use N posix threads." << std::endl + << spaces << " --openmp Use OpenMP with N threads." + << std::endl + << spaces << " --cuda Use CUDA" << std::endl + << std::endl + << spaces << " Required Parameters:" << std::endl + << spaces + << " --amtx Input file in Matrix Market format (.mtx)." + << std::endl + << std::endl + << spaces + << " --algorithm Set the algorithm to use. " + "Allowable values are:" + << std::endl + << spaces + << " COLORING_D2_MATRIX_SQUARED - Matrix-squared + " + "Distance-1 method." + << std::endl + << spaces + << " COLORING_D2_SERIAL - Serial algorithm (must " + "use with 'serial' mode)" + << std::endl + << spaces + << " COLORING_D2_VB - Vertex Based method " + "using boolean forbidden array (Default)." 
+ << std::endl + << spaces + << " COLORING_D2_VB_BIT - VB with Bitvector " + "Forbidden Array" + << std::endl + << spaces + << " COLORING_D2_VB_BIT_EF - VB_BIT with Edge " + "Filtering" + << std::endl + << std::endl + << spaces << " Optional Parameters:" << std::endl + << spaces + << " --output-histogram Print out a histogram of the " + "colors." + << std::endl + << spaces + << " --output-graphviz Write the output to a graphviz " + "file (G.dot)." + << std::endl + << spaces + << " Note: Vertices with color 0 " + "will be filled in and colored" + << std::endl + << spaces + << " --output-graphviz-vert-max Upper limit of vertices in G to " + "allow graphviz output. Default=1500." + << std::endl + << spaces + << " Requires --output-graphviz to " + "also be enabled." + << std::endl + << spaces + << " --validate Check that the coloring is a " + "valid distance-2 graph coloring" + << std::endl + << spaces + << " --verbose-level Set verbosity level [0..5] " + "where N > 0 means print verbose messags." + << std::endl + << spaces << " Default: 0" + << std::endl + << spaces + << " --help Print out command line help." + << std::endl + << spaces << " " << std::endl; } - -int -parse_inputs(KokkosKernels::Example::Parameters& params, int argc, char** argv) -{ - bool got_required_param_amtx = false; - bool got_required_param_algorithm = false; - - for(int i = 1; i < argc; ++i) - { - if(0 == strcasecmp(argv[ i ], "--threads")) - { - params.use_threads = atoi(argv[ ++i ]); - //std::cout << "use_threads = " << params.use_threads << std::endl; - } - else if(0 == strcasecmp(argv[ i ], "--serial")) - { - params.use_serial = atoi(argv[ ++i ]); - //std::cout << "use_serial = " << params.use_serial << std::endl; - } - else if(0 == strcasecmp(argv[ i ], "--openmp")) - { - params.use_openmp = atoi(argv[ ++i ]); - //std::cout << "use_openmp = " << params.use_openmp << std::endl; - } - else if(0 == strcasecmp(argv[ i ], "--cuda")) - { - params.use_cuda = 1; - //std::cout << "use_cuda = " << params.use_cuda << std::endl; - } - else if(0 == strcasecmp(argv[ i ], "--amtx")) - { - got_required_param_amtx = true; - params.mtx_bin_file = argv[ ++i ]; - } - else if(0 == strcasecmp(argv[ i ], "--validate")) - { - params.validate = 1; - } - else if(0 == strcasecmp(argv[ i ], "--verbose-level")) - { - params.verbose_level = atoi( argv[++i] ); - params.verbose_level = std::min(5, params.verbose_level); - params.verbose_level = std::max(0, params.verbose_level); - } - else if(0 == strcasecmp(argv[ i ], "--output-histogram")) - { - params.output_histogram = 1; - } - else if(0 == strcasecmp(argv[ i ], "--output-graphviz")) - { - params.output_graphviz = 1; - } - else if(0 == strcasecmp(argv[ i ], "--output-graphviz-vert-max")) - { - params.output_graphviz_vert_max = atoi( argv[++i] ); - } - else if(0 == strcasecmp(argv[ i ], "--algorithm")) - { - ++i; - if(0 == strcasecmp(argv[ i ], "COLORING_D2_MATRIX_SQUARED")) - { - params.algorithm = 1; - got_required_param_algorithm = true; - } - else if(0 == strcasecmp(argv[ i ], "COLORING_D2_SERIAL")) - { - params.algorithm = 2; - got_required_param_algorithm = true; - } - else if(0 == strcasecmp(argv[ i ], "COLORING_D2_VB") || 0 == strcasecmp(argv[ i ], "COLORING_D2")) - { - params.algorithm = 3; - got_required_param_algorithm = true; - } - else if(0 == strcasecmp(argv[ i ], "COLORING_D2_VB_BIT")) - { - params.algorithm = 4; - got_required_param_algorithm = true; - } - else if(0 == strcasecmp(argv[ i ], "COLORING_D2_VB_BIT_EF")) - { - params.algorithm = 5; - got_required_param_algorithm = 
true; - } - else - { - std::cerr << "2-Unrecognized command line argument #" << i << ": " << argv[ i ] << std::endl; - print_options(std::cout, argv[ 0 ]); - return 1; - } - } - else if(0 == strcasecmp(argv[ i ], "--help") || 0 == strcasecmp(argv[ i ], "-h")) - { - print_options(std::cout, argv[ 0 ]); - return 1; - } - else - { - std::cerr << "3-Unrecognized command line argument #" << i << ": " << argv[ i ] << std::endl; - print_options(std::cout, argv[ 0 ]); - return 1; - } - } - - if(!got_required_param_amtx) - { - std::cout << "Missing required parameter amtx" << std::endl << std::endl; - print_options(std::cout, argv[ 0 ]); - return 1; - } - if(!got_required_param_algorithm) - { - std::cout << "Missing required parameter algorithm" << std::endl << std::endl; - print_options(std::cout, argv[ 0 ]); - return 1; - } - if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) - { - print_options(std::cout, argv[ 0 ]); +int parse_inputs(KokkosKernels::Example::Parameters& params, int argc, + char** argv) { + bool got_required_param_amtx = false; + bool got_required_param_algorithm = false; + + for (int i = 1; i < argc; ++i) { + if (0 == Test::string_compare_no_case(argv[i], "--threads")) { + params.use_threads = atoi(argv[++i]); + // std::cout << "use_threads = " << params.use_threads << std::endl; + } else if (0 == Test::string_compare_no_case(argv[i], "--serial")) { + params.use_serial = atoi(argv[++i]); + // std::cout << "use_serial = " << params.use_serial << std::endl; + } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { + params.use_openmp = atoi(argv[++i]); + // std::cout << "use_openmp = " << params.use_openmp << std::endl; + } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { + params.use_cuda = 1; + // std::cout << "use_cuda = " << params.use_cuda << std::endl; + } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { + got_required_param_amtx = true; + params.mtx_bin_file = argv[++i]; + } else if (0 == Test::string_compare_no_case(argv[i], "--validate")) { + params.validate = 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--verbose-level")) { + params.verbose_level = atoi(argv[++i]); + params.verbose_level = std::min(5, params.verbose_level); + params.verbose_level = std::max(0, params.verbose_level); + } else if (0 == + Test::string_compare_no_case(argv[i], "--output-histogram")) { + params.output_histogram = 1; + } else if (0 == + Test::string_compare_no_case(argv[i], "--output-graphviz")) { + params.output_graphviz = 1; + } else if (0 == Test::string_compare_no_case( + argv[i], "--output-graphviz-vert-max")) { + params.output_graphviz_vert_max = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--algorithm")) { + ++i; + if (0 == + Test::string_compare_no_case(argv[i], "COLORING_D2_MATRIX_SQUARED")) { + params.algorithm = 1; + got_required_param_algorithm = true; + } else if (0 == + Test::string_compare_no_case(argv[i], "COLORING_D2_SERIAL")) { + params.algorithm = 2; + got_required_param_algorithm = true; + } else if (0 == Test::string_compare_no_case(argv[i], "COLORING_D2_VB") || + 0 == Test::string_compare_no_case(argv[i], "COLORING_D2")) { + params.algorithm = 3; + got_required_param_algorithm = true; + } else if (0 == + Test::string_compare_no_case(argv[i], "COLORING_D2_VB_BIT")) { + params.algorithm = 4; + got_required_param_algorithm = true; + } else if (0 == Test::string_compare_no_case(argv[i], + "COLORING_D2_VB_BIT_EF")) { + params.algorithm = 5; + 
got_required_param_algorithm = true; + } else { + std::cerr << "2-Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(std::cout, argv[0]); return 1; + } + } else if (0 == Test::string_compare_no_case(argv[i], "--help") || + 0 == Test::string_compare_no_case(argv[i], "-h")) { + print_options(std::cout, argv[0]); + return 1; + } else { + std::cerr << "3-Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(std::cout, argv[0]); + return 1; } - return 0; -} - - - -template -void -run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) -{ - using namespace KokkosGraph; - using namespace KokkosGraph::Experimental; - - int algorithm = params.algorithm; - int shmemsize = params.shmemsize; - - using lno_view_type = typename CrsGraph_type::row_map_type::non_const_type; - using lno_nnz_view_type = typename CrsGraph_type::entries_type::non_const_type; - using size_type = typename lno_view_type::non_const_value_type; - using lno_type = typename lno_nnz_view_type::non_const_value_type; - using KernelHandle_type = KokkosKernels::Experimental::KokkosKernelsHandle; - - - // Create a kernel handle - KernelHandle_type kh; - kh.set_shmem_size(shmemsize); - - if(params.verbose_level > 0) - { - kh.set_verbose(true); - } - - // ------------------------------------------ - // Set up the D2 coloring kernel handle - // ------------------------------------------ - std::string label_algorithm; - switch(algorithm) - { - case 1: - kh.create_distance2_graph_coloring_handle(COLORING_D2_MATRIX_SQUARED); - label_algorithm = "COLORING_D2_MATRIX_SQUARED"; - break; - case 2: - kh.create_distance2_graph_coloring_handle(COLORING_D2_SERIAL); - label_algorithm = "COLORING_D2_SERIAL"; - break; - case 3: - kh.create_distance2_graph_coloring_handle(COLORING_D2_VB); - label_algorithm = "COLORING_D2_VB"; - break; - case 4: - kh.create_distance2_graph_coloring_handle(COLORING_D2_VB_BIT); - label_algorithm = "COLORING_D2_VB_BIT"; - break; - case 5: - kh.create_distance2_graph_coloring_handle(COLORING_D2_VB_BIT_EF); - label_algorithm = "COLORING_D2_VB_BIT_EF"; - break; - default: - kh.create_distance2_graph_coloring_handle(COLORING_D2_VB); - label_algorithm = "COLORING_D2_VB"; - break; - } - - std::cout << std::endl << "Run Graph Color D2 (" << label_algorithm << ")" << std::endl; - - // ------------------------------------------ - // Call the distance-2 graph coloring routine - // ------------------------------------------ - graph_compute_distance2_color(&kh, - crsGraph.numRows(), - num_cols, - crsGraph.row_map, - crsGraph.entries, - crsGraph.row_map, - crsGraph.entries); - - // ------------------------------------------ - // Get the results - // ------------------------------------------ - size_t num_colors = kh.get_distance2_graph_coloring_handle()->get_num_colors(); - size_t num_phases = kh.get_distance2_graph_coloring_handle()->get_num_phases(); - - if(params.verbose_level > 0) - { - std::cout << "Total Time: " << kh.get_distance2_graph_coloring_handle()->get_overall_coloring_time() << std::endl - << "Num colors: " << kh.get_distance2_graph_coloring_handle()->get_num_colors() << std::endl - << "Num Phases: " << kh.get_distance2_graph_coloring_handle()->get_num_phases() << std::endl - << "Colors:\n\t"; - KokkosKernels::Impl::print_1Dview(kh.get_distance2_graph_coloring_handle()->get_vertex_colors()); - std::cout << std::endl; - } - - // ------------------------------------------ - // Save coloring to a GraphViz file - // 
------------------------------------------ - if(params.output_graphviz && crsGraph.numRows() <= params.output_graphviz_vert_max) - { - auto colors = kh.get_distance2_graph_coloring_handle()->get_vertex_colors(); - - std::ofstream os("G.dot", std::ofstream::out); - - kh.get_distance2_graph_coloring_handle()->dump_graphviz(os, crsGraph.numRows(), crsGraph.row_map, crsGraph.entries, colors); - } - - // ------------------------------------------ - // Verify correctness - // ------------------------------------------ - std::string str_color_is_valid = "UNKNOWN"; - if(0 != params.validate) - { - str_color_is_valid = "VALID"; - - bool d2_coloring_is_valid = false; - bool d2_coloring_validation_flags[ 4 ] = {false}; - - d2_coloring_is_valid = KokkosGraph::Impl::graph_verify_distance2_color(&kh, - crsGraph.numRows(), - //crsGraph.numCols(), - num_cols, - crsGraph.row_map, - crsGraph.entries, - crsGraph.row_map, - crsGraph.entries, - d2_coloring_validation_flags); - - // Print out messages based on coloring validation check. - if(d2_coloring_is_valid) - { - std::cout << std::endl << "Distance-2 Graph Coloring is VALID" << std::endl << std::endl; - } - else - { - str_color_is_valid = "INVALID"; - std::cout << std::endl - << "Distance-2 Graph Coloring is NOT VALID" << std::endl - << " - Vert(s) left uncolored : " << d2_coloring_validation_flags[ 1 ] << std::endl - << " - Invalid D2 Coloring : " << d2_coloring_validation_flags[ 2 ] << std::endl - << std::endl; - } - if(d2_coloring_validation_flags[ 3 ]) - { - std::cout << "Distance-2 Graph Coloring may have poor quality." << std::endl - << " - Vert(s) have high color value : " << d2_coloring_validation_flags[ 3 ] << std::endl - << std::endl; - } - } - - // ------------------------------------------ - // Print out a histogram of the colors - // ------------------------------------------ - if(0 != params.output_histogram) - { - KokkosGraph::Impl::graph_print_distance2_color_histogram(&kh, - crsGraph.numRows(), - num_cols, - crsGraph.row_map, - crsGraph.entries, - crsGraph.row_map, - crsGraph.entries, - false); - } - - // ------------------------------------------ - // Print out a summary - // ------------------------------------------ - std::string mtx_bin_file = params.mtx_bin_file; - mtx_bin_file = mtx_bin_file.substr(mtx_bin_file.find_last_of("/\\") + 1); - - std::cout << "Summary" << std::endl - << "-------" << std::endl - << " KExecSName : " << Kokkos::DefaultExecutionSpace::name() << std::endl - << " Filename : " << mtx_bin_file << std::endl - << " Num Verts : " << crsGraph.numRows() << std::endl - << " Num Edges : " << crsGraph.entries.extent(0) << std::endl - << " Concurrency : " << Kokkos::DefaultExecutionSpace::concurrency() << std::endl - << " Algorithm : " << label_algorithm << std::endl - << "Coloring Stats" << std::endl - << " Num colors : " << num_colors << std::endl - << " Num Phases : " << num_phases << std::endl - << " Validation : " << str_color_is_valid << std::endl + } + + if (!got_required_param_amtx) { + std::cout << "Missing required parameter amtx" << std::endl << std::endl; + print_options(std::cout, argv[0]); + return 1; + } + if (!got_required_param_algorithm) { + std::cout << "Missing required parameter algorithm" << std::endl << std::endl; + print_options(std::cout, argv[0]); + return 1; + } + if (!params.use_serial && !params.use_threads && !params.use_openmp && + !params.use_cuda) { + print_options(std::cout, argv[0]); + return 1; + } + return 0; +} -} // run_example() - - - -template -void -driver(Parameters params) -{ - 
using myExecSpace = exec_space; - using myFastDevice = Kokkos::Device; - using crstmat_type = typename KokkosSparse::CrsMatrix; - using graph_type = typename crstmat_type::StaticCrsGraphType; - using data_type = typename graph_type::data_type; - - char* mat_file = params.mtx_bin_file; - - crstmat_type crsmat = KokkosKernels::Impl::read_kokkos_crst_matrix(mat_file); - graph_type crsgraph = crsmat.graph; - data_type num_cols = crsmat.numCols(); - - KokkosKernels::Example::run_example - (crsgraph, num_cols, params); - -} // driver() - - -} // namespace Example -} // namespace KokkosKernels - - - -int -main(int argc, char* argv[]) -{ - KokkosKernels::Example::Parameters params; - - if(parse_inputs(params, argc, argv)) - { - return 1; - } - - if(params.mtx_bin_file == NULL) - { - std::cerr << "Provide a matrix file" << std::endl; - return 0; +template +void run_example(CrsGraph_type crsGraph, DataType num_cols, Parameters params) { + using namespace KokkosGraph; + using namespace KokkosGraph::Experimental; + + int algorithm = params.algorithm; + int shmemsize = params.shmemsize; + + using lno_view_type = typename CrsGraph_type::row_map_type::non_const_type; + using lno_nnz_view_type = + typename CrsGraph_type::entries_type::non_const_type; + using size_type = typename lno_view_type::non_const_value_type; + using lno_type = typename lno_nnz_view_type::non_const_value_type; + using KernelHandle_type = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_type, kk_scalar_type, ExecSpace, TempMemSpace, + PersistentMemSpace>; + + // Create a kernel handle + KernelHandle_type kh; + kh.set_shmem_size(shmemsize); + + if (params.verbose_level > 0) { + kh.set_verbose(true); + } + + // ------------------------------------------ + // Set up the D2 coloring kernel handle + // ------------------------------------------ + std::string label_algorithm; + switch (algorithm) { + case 1: + kh.create_distance2_graph_coloring_handle(COLORING_D2_MATRIX_SQUARED); + label_algorithm = "COLORING_D2_MATRIX_SQUARED"; + break; + case 2: + kh.create_distance2_graph_coloring_handle(COLORING_D2_SERIAL); + label_algorithm = "COLORING_D2_SERIAL"; + break; + case 3: + kh.create_distance2_graph_coloring_handle(COLORING_D2_VB); + label_algorithm = "COLORING_D2_VB"; + break; + case 4: + kh.create_distance2_graph_coloring_handle(COLORING_D2_VB_BIT); + label_algorithm = "COLORING_D2_VB_BIT"; + break; + case 5: + kh.create_distance2_graph_coloring_handle(COLORING_D2_VB_BIT_EF); + label_algorithm = "COLORING_D2_VB_BIT_EF"; + break; + default: + kh.create_distance2_graph_coloring_handle(COLORING_D2_VB); + label_algorithm = "COLORING_D2_VB"; + break; + } + + std::cout << std::endl + << "Run Graph Color D2 (" << label_algorithm << ")" << std::endl; + + // ------------------------------------------ + // Call the distance-2 graph coloring routine + // ------------------------------------------ + graph_compute_distance2_color(&kh, crsGraph.numRows(), num_cols, + crsGraph.row_map, crsGraph.entries, + crsGraph.row_map, crsGraph.entries); + + // ------------------------------------------ + // Get the results + // ------------------------------------------ + size_t num_colors = + kh.get_distance2_graph_coloring_handle()->get_num_colors(); + size_t num_phases = + kh.get_distance2_graph_coloring_handle()->get_num_phases(); + + if (params.verbose_level > 0) { + std::cout + << "Total Time: " + << kh.get_distance2_graph_coloring_handle()->get_overall_coloring_time() + << std::endl + << "Num colors: " + << 
kh.get_distance2_graph_coloring_handle()->get_num_colors() + << std::endl + << "Num Phases: " + << kh.get_distance2_graph_coloring_handle()->get_num_phases() + << std::endl + << "Colors:\n\t"; + KokkosKernels::Impl::print_1Dview( + kh.get_distance2_graph_coloring_handle()->get_vertex_colors()); + std::cout << std::endl; + } + + // ------------------------------------------ + // Save coloring to a GraphViz file + // ------------------------------------------ + if (params.output_graphviz && + crsGraph.numRows() <= params.output_graphviz_vert_max) { + auto colors = kh.get_distance2_graph_coloring_handle()->get_vertex_colors(); + + std::ofstream os("G.dot", std::ofstream::out); + + kh.get_distance2_graph_coloring_handle()->dump_graphviz( + os, crsGraph.numRows(), crsGraph.row_map, crsGraph.entries, colors); + } + + // ------------------------------------------ + // Verify correctness + // ------------------------------------------ + std::string str_color_is_valid = "UNKNOWN"; + if (0 != params.validate) { + str_color_is_valid = "VALID"; + + bool d2_coloring_is_valid = false; + bool d2_coloring_validation_flags[4] = {false}; + + d2_coloring_is_valid = KokkosGraph::Impl::graph_verify_distance2_color( + &kh, crsGraph.numRows(), + // crsGraph.numCols(), + num_cols, crsGraph.row_map, crsGraph.entries, crsGraph.row_map, + crsGraph.entries, d2_coloring_validation_flags); + + // Print out messages based on coloring validation check. + if (d2_coloring_is_valid) { + std::cout << std::endl + << "Distance-2 Graph Coloring is VALID" << std::endl + << std::endl; + } else { + str_color_is_valid = "INVALID"; + std::cout << std::endl + << "Distance-2 Graph Coloring is NOT VALID" << std::endl + << " - Vert(s) left uncolored : " + << d2_coloring_validation_flags[1] << std::endl + << " - Invalid D2 Coloring : " + << d2_coloring_validation_flags[2] << std::endl + << std::endl; } - - const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided as number of threads - const int device_id = 0; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); - - // Print out information about the configuration of the run if verbose_level >= 5 - if(params.verbose_level >= 5) - { - Kokkos::print_configuration(std::cout); + if (d2_coloring_validation_flags[3]) { + std::cout << "Distance-2 Graph Coloring may have poor quality." 
+ << std::endl + << " - Vert(s) have high color value : " + << d2_coloring_validation_flags[3] << std::endl + << std::endl; } + } + + // ------------------------------------------ + // Print out a histogram of the colors + // ------------------------------------------ + if (0 != params.output_histogram) { + KokkosGraph::Impl::graph_print_distance2_color_histogram( + &kh, crsGraph.numRows(), num_cols, crsGraph.row_map, crsGraph.entries, + crsGraph.row_map, crsGraph.entries, false); + } + + // ------------------------------------------ + // Print out a summary + // ------------------------------------------ + std::string mtx_bin_file = params.mtx_bin_file; + mtx_bin_file = mtx_bin_file.substr(mtx_bin_file.find_last_of("/\\") + 1); + + std::cout << "Summary" << std::endl + << "-------" << std::endl + << " KExecSName : " << Kokkos::DefaultExecutionSpace::name() + << std::endl + << " Filename : " << mtx_bin_file << std::endl + << " Num Verts : " << crsGraph.numRows() << std::endl + << " Num Edges : " << crsGraph.entries.extent(0) + << std::endl + << " Concurrency : " + << Kokkos::DefaultExecutionSpace::concurrency() << std::endl + << " Algorithm : " << label_algorithm << std::endl + << "Coloring Stats" << std::endl + << " Num colors : " << num_colors << std::endl + << " Num Phases : " << num_phases << std::endl + << " Validation : " << str_color_is_valid << std::endl + << std::endl; + +} // run_example() + +template +void driver(Parameters params) { + using myExecSpace = exec_space; + using myFastDevice = Kokkos::Device; + using crstmat_type = + typename KokkosSparse::CrsMatrix; + using graph_type = typename crstmat_type::StaticCrsGraphType; + using data_type = typename graph_type::data_type; + + char* mat_file = params.mtx_bin_file; + + crstmat_type crsmat = + KokkosKernels::Impl::read_kokkos_crst_matrix(mat_file); + graph_type crsgraph = crsmat.graph; + data_type num_cols = crsmat.numCols(); + + KokkosKernels::Example::run_example( + crsgraph, num_cols, params); + +} // driver() + +} // namespace Example +} // namespace KokkosKernels + +int main(int argc, char* argv[]) { + KokkosKernels::Example::Parameters params; + + if (parse_inputs(params, argc, argv)) { + return 1; + } + + if (params.mtx_bin_file == NULL) { + std::cerr << "Provide a matrix file" << std::endl; + return 0; + } + + const int num_threads = + params.use_openmp; // Assumption is that use_openmp variable is provided + // as number of threads + const int device_id = 0; + Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + + // Print out information about the configuration of the run if verbose_level + // >= 5 + if (params.verbose_level >= 5) { + Kokkos::print_configuration(std::cout); + } + +#if defined(KOKKOS_ENABLE_OPENMP) + if (params.use_openmp) { + KokkosKernels::Example::driver(params); + } +#endif - #if defined(KOKKOS_ENABLE_OPENMP) - if(params.use_openmp) - { - KokkosKernels::Example::driver(params); - } - #endif - - #if defined(KOKKOS_ENABLE_CUDA) - if(params.use_cuda) - { - KokkosKernels::Example::driver(params); - } - #endif +#if defined(KOKKOS_ENABLE_CUDA) + if (params.use_cuda) { + KokkosKernels::Example::driver(params); + } +#endif - #if defined(KOKKOS_ENABLE_SERIAL) - if(params.use_serial) - { - KokkosKernels::Example::driver(params); - } - #endif +#if defined(KOKKOS_ENABLE_SERIAL) + if (params.use_serial) { + KokkosKernels::Example::driver(params); + } +#endif - Kokkos::finalize(); + Kokkos::finalize(); - return 0; + return 0; } diff --git 
a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp index 74aa8fb802..9909c55720 100644 --- a/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp +++ b/example/hashmap_accumulator/KokkosKernels_Example_HashmapAccumulator.cpp @@ -65,359 +65,335 @@ #include #include - - // Command Line Parameters structure -typedef struct params -{ - uint32_t use_serial = false; - uint32_t use_threads = false; - uint32_t use_cuda = false; - uint32_t use_openmp = false; - bool verbose = false; - - size_t problem_size = 20; - size_t repeat = 1; +typedef struct params { + uint32_t use_serial = false; + uint32_t use_threads = false; + uint32_t use_cuda = false; + uint32_t use_openmp = false; + bool verbose = false; + + size_t problem_size = 20; + size_t repeat = 1; } parameters_t; - - namespace KokkosKernels { namespace Experiment { +template +struct functorTestHashmapAccumulator { + typedef ExecutionSpace execution_space; + typedef typename Kokkos::View data_view_t; + + const size_t _num_entries; + const data_view_t _data; + uniform_memory_pool_t _memory_pool; + const size_t _hash_size; + const size_t _max_hash_entries; + const parameters_t& _params; + + typedef Kokkos::Experimental::UniqueToken< + execution_space, Kokkos::Experimental::UniqueTokenScope::Global> + unique_token_t; + unique_token_t tokens; + + functorTestHashmapAccumulator(const size_t num_entries, + const data_view_t& data, + uniform_memory_pool_t memory_pool, + const size_t hash_size, + const size_t max_hash_entries, + const parameters_t& params) + : _num_entries(num_entries), + _data(data), + _memory_pool(memory_pool), + _hash_size(hash_size), + _max_hash_entries(max_hash_entries), + _params(params), + tokens(ExecutionSpace()) { + if (_params.verbose) { + std::cout << "UniqueToken.size: " << tokens.size() << std::endl; + } + } - template - struct functorTestHashmapAccumulator - { - typedef ExecutionSpace execution_space; - typedef typename Kokkos::View data_view_t; - - const size_t _num_entries; - const data_view_t _data; - uniform_memory_pool_t _memory_pool; - const size_t _hash_size; - const size_t _max_hash_entries; - const parameters_t& _params; - - typedef Kokkos::Experimental::UniqueToken unique_token_t; - unique_token_t tokens; - - functorTestHashmapAccumulator( const size_t num_entries, - const data_view_t& data, - uniform_memory_pool_t memory_pool, - const size_t hash_size, - const size_t max_hash_entries, - const parameters_t& params) - : _num_entries(num_entries) - , _data(data) - , _memory_pool(memory_pool) - , _hash_size(hash_size) - , _max_hash_entries(max_hash_entries) - , _params(params) - , tokens( ExecutionSpace() ) - { - if(_params.verbose) - { - std::cout << "UniqueToken.size: " << tokens.size() << std::endl; - } - } - - KOKKOS_INLINE_FUNCTION - void operator()(const scalar_t idx) const - { - typedef scalar_t hash_size_type; - typedef scalar_t hash_key_type; - typedef scalar_t hash_value_type; - - // Alternative to team_policy thread id - auto tid = tokens.acquire(); - - // Acquire a chunk from the memory pool using a spin-loop. 
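Both the removed and the reformatted functor bodies in this hunk follow the same acquire, use, release discipline against the UniformMemoryPool. The fragment below restates that discipline with the hunk's own names; it is a sketch only, and the "returns nullptr until a chunk is free" behaviour of allocate_chunk is inferred from the spin-loop here rather than from separate documentation.

// Sketch (not part of the patch); _memory_pool, tokens and scalar_t are the
// members of the functor in this hunk.
auto tid = tokens.acquire();                       // stable per-thread id
volatile scalar_t* ptr_temp = nullptr;
while (ptr_temp == nullptr) {                      // spin until some chunk is released
  ptr_temp = (volatile scalar_t*)_memory_pool.allocate_chunk(tid);
}
scalar_t* ptr_memory_pool_chunk = (scalar_t*)ptr_temp;  // keep the base pointer
// ptr_temp is then advanced to carve out the hash arrays, so it is the saved
// base pointer that must be handed back when the thread is done:
_memory_pool.release_chunk(ptr_memory_pool_chunk);
tokens.release(tid);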
- volatile scalar_t* ptr_temp = nullptr; - while(nullptr==ptr_temp) - { - ptr_temp = (volatile scalar_t*)(_memory_pool.allocate_chunk(tid)); - } - scalar_t* ptr_memory_pool_chunk = (scalar_t*)(ptr_temp); - - KokkosKernels::Experimental::HashmapAccumulator hash_map; - - // Set pointer to hash indices - scalar_t* used_hash_indices = (scalar_t*)(ptr_temp); - ptr_temp += _hash_size; - - // Set pointer to hash begins - hash_map.hash_begins = (scalar_t*)(ptr_temp); - ptr_temp += _hash_size; - - // Set pointer to hash nexts - hash_map.hash_nexts = (scalar_t*)(ptr_temp); - ptr_temp += _max_hash_entries; - - // Set pointer to hash keys - hash_map.keys = (scalar_t*)(ptr_temp); - // ptr_temp += _max_hash_entries; - - // Set pointer to hash values - //hash_map.values = (scalar_t*)(ptr_temp); - - // Set up limits in Hashmap_Accumulator - hash_map.hash_key_size = _max_hash_entries; - hash_map.max_value_size = _max_hash_entries; - - // hash function is hash_size-1 (note: hash_size must be a power of 2) - scalar_t hash_func_pow2 = _hash_size-1; - - // These are updated by Hashmap_Accumulator insert functions. - scalar_t used_hash_size = 0; - scalar_t used_hash_count = 0; - - // Loop over stuff - for(size_t i=0; i<_num_entries; i++) - { - scalar_t key = _data(i); - - // Compute the hash index using & instead of % (modulus is slower). - scalar_t hash = key & hash_func_pow2; - - int r = hash_map.sequential_insert_into_hash_TrackHashes(hash, - key, - &used_hash_size, - hash_map.max_value_size, - &used_hash_count, - used_hash_indices); - - // Check return code - if(r) - { - // insert should return nonzero if the insert failed, but for sequential_insert_into_hash_TrackHashes - // the 'full' case is currently ignored, so r will always be 0. - } - } - - // TODO: Get the # of unique values inserted and return that out of the functor. - - // Reset the Begins values to -1 before releasing the memory pool chunk. - // If you don't do this the next thread that grabs this memory chunk will not work properly. 
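The pointer arithmetic earlier in this operator() carved a single pool chunk into the four arrays the HashmapAccumulator needs, and the reset loop below only has to visit the buckets recorded in used_hash_indices because of that layout. To make the offsets explicit, a compact restatement using the same member names and sizes as this hunk (a layout sketch, not additional patch content):

// One chunk of 2*_hash_size + 2*_max_hash_entries scalar_t's, used as:
scalar_t* p                 = ptr_memory_pool_chunk;
scalar_t* used_hash_indices = p;  p += _hash_size;         // buckets dirtied by this thread
hash_map.hash_begins        = p;  p += _hash_size;         // head entry per bucket, -1 means empty
hash_map.hash_nexts         = p;  p += _max_hash_entries;  // collision-chain links
hash_map.keys               = p;                           // inserted keys
// hash_map.values would follow here if the accumulator also stored values.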
- for(scalar_t i=0; i - void experiment(const parameters_t& params) - { - typedef typename KokkosKernels::Impl::UniformMemoryPool uniform_memory_pool_t; - typedef typename Kokkos::View data_view_t; - typedef typename data_view_t::HostMirror data_view_hostmirror_t; - - size_t num_entries = params.problem_size; - - // Set max value in the list - size_t max_value = 100; - - // Get the concurrecny - size_t concurrency = execution_space::concurrency(); - - // Set up random number generator - std::random_device rd; - std::mt19937 eng(rd()); - std::uniform_int_distribution distr(1, max_value); - - // Create a view of random values - data_view_t d_data("data", num_entries); - data_view_hostmirror_t h_data = Kokkos::create_mirror_view(d_data); - - for(size_t i=0; i m_space(mem_chunk_count, mem_chunk_size, -1, pool_type); - uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, -1, pool_type); - - functorTestHashmapAccumulator - testHashmapAccumulator(num_entries, d_data, memory_pool, hash_size, max_hash_entries, params); + KOKKOS_INLINE_FUNCTION + void operator()(const scalar_t idx) const { + typedef scalar_t hash_size_type; + typedef scalar_t hash_key_type; + typedef scalar_t hash_value_type; - Kokkos::parallel_for("testHashmapAccumulator", num_entries, testHashmapAccumulator); + // Alternative to team_policy thread id + auto tid = tokens.acquire(); - if(params.verbose) - { - double t = timer.seconds(); - std::cout << "Execution Time: " << std::setw(-2) << t << std::endl; - timer.reset(); - } + // Acquire a chunk from the memory pool using a spin-loop. + volatile scalar_t* ptr_temp = nullptr; + while (nullptr == ptr_temp) { + ptr_temp = (volatile scalar_t*)(_memory_pool.allocate_chunk(tid)); } + scalar_t* ptr_memory_pool_chunk = (scalar_t*)(ptr_temp); -} // namespace Experiment -} // namespace KokkosKernels - - - -void print_options(std::ostream &os, const char *app_name, unsigned int indent = 0) -{ - std::string spaces(indent, ' '); - os << "Usage:" << std::endl - << spaces << " " << app_name << " [parameters]" << std::endl - << std::endl - << spaces << "Parameters:" << std::endl - << spaces << " Parallelism (select one of the following):" << std::endl - << spaces << " --serial Execute serially." << std::endl - << spaces << " --threads Use N posix threads." << std::endl - << spaces << " --openmp Use OpenMP with N threads." << std::endl - << spaces << " --cuda Use CUDA" << std::endl - << spaces << " Optional Parameters:" << std::endl - << spaces << " --problem-size Problem Size (Default: 20)" << std::endl - << spaces << " --verbose Verbose output" << std::endl - << spaces << " --help Print out command line help." 
<< std::endl - << spaces << " " << std::endl; -} // print_options - - - -//int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, char **argv) -int parse_inputs(parameters_t ¶ms, int argc, char **argv) -{ - if(argc==1) - { - print_options(std::cout, argv[0]); - return 1; - } + KokkosKernels::Experimental::HashmapAccumulator< + hash_size_type, hash_key_type, hash_value_type> + hash_map; + + // Set pointer to hash indices + scalar_t* used_hash_indices = (scalar_t*)(ptr_temp); + ptr_temp += _hash_size; + + // Set pointer to hash begins + hash_map.hash_begins = (scalar_t*)(ptr_temp); + ptr_temp += _hash_size; + + // Set pointer to hash nexts + hash_map.hash_nexts = (scalar_t*)(ptr_temp); + ptr_temp += _max_hash_entries; + + // Set pointer to hash keys + hash_map.keys = (scalar_t*)(ptr_temp); + // ptr_temp += _max_hash_entries; + + // Set pointer to hash values + // hash_map.values = (scalar_t*)(ptr_temp); + + // Set up limits in Hashmap_Accumulator + hash_map.hash_key_size = _max_hash_entries; + hash_map.max_value_size = _max_hash_entries; + + // hash function is hash_size-1 (note: hash_size must be a power of 2) + scalar_t hash_func_pow2 = _hash_size - 1; + + // These are updated by Hashmap_Accumulator insert functions. + scalar_t used_hash_size = 0; + scalar_t used_hash_count = 0; - for(int i = 1; i < argc; ++i) - { - if(0 == strcasecmp(argv[i], "--threads")) - { - params.use_threads = atoi(argv[++i]); - } - else if(0 == strcasecmp(argv[i], "--serial")) - { - params.use_serial = atoi(argv[++i]); - } - else if(0 == strcasecmp(argv[i], "--openmp")) - { - params.use_openmp = atoi(argv[++i]); - } - else if(0 == strcasecmp(argv[i], "--cuda")) - { - params.use_cuda = 1; - } - else if (0 == strcasecmp(argv[i], "--repeat")) - { - params.repeat = atoi(argv[++i]); - } - else if (0 == strcasecmp(argv[i], "--problem-size")) - { - params.problem_size = atoi(argv[++i]); - } - else if(0 == strcasecmp(argv[i], "--verbose") || 0 == strcasecmp(argv[i], "-V") ) - { - params.verbose = true; - } - else if(0 == strcasecmp(argv[i], "help") || 0 == strcasecmp(argv[i], "-h")) - { - print_options(std::cout, argv[0]); - return 1; - } - else - { - std::cerr << "3-Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; - print_options(std::cout, argv[0]); - return 1; - } + // Loop over stuff + for (size_t i = 0; i < _num_entries; i++) { + scalar_t key = _data(i); + + // Compute the hash index using & instead of % (modulus is slower). + scalar_t hash = key & hash_func_pow2; + + int r = hash_map.sequential_insert_into_hash_TrackHashes( + hash, key, &used_hash_size, hash_map.max_value_size, &used_hash_count, + used_hash_indices); + + // Check return code + if (r) { + // insert should return nonzero if the insert failed, but for + // sequential_insert_into_hash_TrackHashes the 'full' case is currently + // ignored, so r will always be 0. + } } - if(!params.use_serial && !params.use_threads && !params.use_openmp && !params.use_cuda) - { - print_options(std::cout, argv[0]); - return 1; + + // TODO: Get the # of unique values inserted and return that out of the + // functor. + + // Reset the Begins values to -1 before releasing the memory pool chunk. + // If you don't do this the next thread that grabs this memory chunk will + // not work properly. 
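The insert loop above computes each bucket as key & hash_func_pow2 instead of key % hash_size, which is only valid because experiment() forces hash_size to a power of two. A small self-contained check of that equivalence (a standalone snippet, separate from the example itself):

#include <cassert>
#include <cstdint>

int main() {
  const std::uint64_t hash_size = 128;  // any power of two works
  const std::uint64_t mask      = hash_size - 1;
  for (std::uint64_t key = 0; key < 10000; ++key) {
    // Masking with (size - 1) and taking the modulus agree for power-of-two sizes.
    assert((key & mask) == (key % hash_size));
  }
  return 0;
}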
+ for (scalar_t i = 0; i < used_hash_count; i++) { + scalar_t dirty_hash = used_hash_indices[i]; + hash_map.hash_begins[dirty_hash] = -1; } - return 0; -} // parse_inputs + // Release the memory pool chunk back to the pool + _memory_pool.release_chunk(ptr_memory_pool_chunk); + // Release the UniqueToken + tokens.release(tid); -int main(int argc, char *argv[]) -{ - //KokkosKernels::Experiment::Parameters params; - parameters_t params; + } // operator() - // Override default repeats (default is 6) - params.repeat = 1; +}; // functorTestHashmapAccumulator - if(parse_inputs(params, argc, argv)) - { - return 1; - } +template +void experiment(const parameters_t& params) { + typedef + typename KokkosKernels::Impl::UniformMemoryPool + uniform_memory_pool_t; + typedef typename Kokkos::View data_view_t; + typedef typename data_view_t::HostMirror data_view_hostmirror_t; - const int device_id = 0; - const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided as number of threads + size_t num_entries = params.problem_size; - Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + // Set max value in the list + size_t max_value = 100; - if(params.verbose) - { - Kokkos::print_configuration(std::cout); - } + // Get the concurrecny + size_t concurrency = execution_space::concurrency(); - // Work goes here. - KokkosKernels::Experiment::experiment(params); + // Set up random number generator + std::random_device rd; + std::mt19937 eng(rd()); + std::uniform_int_distribution distr(1, max_value); - Kokkos::finalize(); - std::cout << "Done." << std::endl; - return 0; + // Create a view of random values + data_view_t d_data("data", num_entries); + data_view_hostmirror_t h_data = Kokkos::create_mirror_view(d_data); + + for (size_t i = 0; i < num_entries; i++) { + h_data(i) = distr(eng); + } + + // Print out the array of random numbers if the list size is small. + if (num_entries <= 50 || params.verbose) { + std::cout << "Data: "; + for (size_t i = 0; i < num_entries; i++) { + std::cout << h_data(i) << " "; + } + std::cout << std::endl; + } + + Kokkos::Timer timer; + + // Deep copy initialized values to device memory. + Kokkos::deep_copy(d_data, h_data); + + // Set Hash Table Parameters + size_t max_hash_entries = max_value; // Max number of entries that can be + // inserted (values allowed are 1..100) + size_t hash_size_hint = + max_value; // How many hash keys are allowed. The actual hash size will + // be set to the next power of 2 bigger than hash_size_hint. + + // Set the hash_size as the next power of 2 bigger than hash_size_hint. + // - hash_size must be a power of two since we use & rather than % (which is + // slower) for computing the hash value for HashmapAccumulator. + size_t hash_size = 1; + while (hash_size < hash_size_hint) { + hash_size *= 2; + } + + // Create Uniform Initialized Memory Pool + KokkosKernels::Impl::PoolType pool_type = + KokkosKernels::Impl::OneThread2OneChunk; + + // Determine memory chunk size for UniformMemoryPool + size_t mem_chunk_size = hash_size; // for hash indices + mem_chunk_size += hash_size; // for hash begins + mem_chunk_size += max_hash_entries; // for hash nexts + mem_chunk_size += max_hash_entries; // for hash keys + // mem_chunk_size += max_entries; // for hash values + + // Set a cap on # of chunks to 32. In application something else should be + // done here differently if we're OpenMP vs. GPU but for this example we can + // just cap our number of chunks at 32. 
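Taken together, experiment() rounds the hash size up to a power of two, sizes each chunk to back the four arrays laid out in the functor, and caps the chunk count, as the KOKKOSKERNELS_MACRO_MIN line that follows does. A condensed standalone sketch of that arithmetic, with a hypothetical concurrency value standing in for execution_space::concurrency():

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t max_value        = 100;        // values are drawn from 1..100
  const std::size_t max_hash_entries = max_value;  // worst case: every value distinct
  std::size_t hash_size = 1;
  while (hash_size < max_value) hash_size *= 2;    // 128, so the & mask above is legal

  const std::size_t mem_chunk_size = 2 * hash_size          // used_hash_indices + hash_begins
                                   + 2 * max_hash_entries;  // hash_nexts + keys
  const std::size_t concurrency     = 64;                   // hypothetical; queried at run time
  const std::size_t mem_chunk_count = std::min<std::size_t>(32, concurrency);
  std::printf("hash_size=%zu chunk_size=%zu chunks=%zu\n", hash_size, mem_chunk_size,
              mem_chunk_count);
  return 0;
}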
+ size_t mem_chunk_count = KOKKOSKERNELS_MACRO_MIN(32, concurrency); + + // KokkosKernels::Impl::UniformMemoryPool m_space(mem_chunk_count, mem_chunk_size, -1, pool_type); + uniform_memory_pool_t memory_pool(mem_chunk_count, mem_chunk_size, -1, + pool_type); + + functorTestHashmapAccumulator + testHashmapAccumulator(num_entries, d_data, memory_pool, hash_size, + max_hash_entries, params); + + Kokkos::parallel_for("testHashmapAccumulator", num_entries, + testHashmapAccumulator); + + if (params.verbose) { + double t = timer.seconds(); + std::cout << "Execution Time: " << std::setw(-2) << t << std::endl; + timer.reset(); + } +} + +} // namespace Experiment +} // namespace KokkosKernels + +void print_options(std::ostream& os, const char* app_name, + unsigned int indent = 0) { + std::string spaces(indent, ' '); + os << "Usage:" << std::endl + << spaces << " " << app_name << " [parameters]" << std::endl + << std::endl + << spaces << "Parameters:" << std::endl + << spaces << " Parallelism (select one of the following):" << std::endl + << spaces << " --serial Execute serially." << std::endl + << spaces << " --threads Use N posix threads." << std::endl + << spaces << " --openmp Use OpenMP with N threads." + << std::endl + << spaces << " --cuda Use CUDA" << std::endl + << spaces << " Optional Parameters:" << std::endl + << spaces << " --problem-size Problem Size (Default: 20)" + << std::endl + << spaces << " --verbose Verbose output" << std::endl + << spaces << " --help Print out command line help." + << std::endl + << spaces << " " << std::endl; +} // print_options + +// int parse_inputs(KokkosKernels::Experiment::Parameters ¶ms, int argc, +// char **argv) +int parse_inputs(parameters_t& params, int argc, char** argv) { + if (argc == 1) { + print_options(std::cout, argv[0]); + return 1; + } + + for (int i = 1; i < argc; ++i) { + if (0 == Test::string_compare_no_case(argv[i], "--threads")) { + params.use_threads = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--serial")) { + params.use_serial = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { + params.use_openmp = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { + params.use_cuda = 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { + params.repeat = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--problem-size")) { + params.problem_size = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--verbose") || + 0 == Test::string_compare_no_case(argv[i], "-V")) { + params.verbose = true; + } else if (0 == Test::string_compare_no_case(argv[i], "help") || + 0 == Test::string_compare_no_case(argv[i], "-h")) { + print_options(std::cout, argv[0]); + return 1; + } else { + std::cerr << "3-Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(std::cout, argv[0]); + return 1; + } + } + if (!params.use_serial && !params.use_threads && !params.use_openmp && + !params.use_cuda) { + print_options(std::cout, argv[0]); + return 1; + } + return 0; +} // parse_inputs + +int main(int argc, char* argv[]) { + // KokkosKernels::Experiment::Parameters params; + parameters_t params; + + // Override default repeats (default is 6) + params.repeat = 1; + + if (parse_inputs(params, argc, argv)) { + return 1; + } + + const int device_id = 0; + const int num_threads = + params.use_openmp; // Assumption is that use_openmp variable is provided + // as number of threads + + 
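Both example mains hand Kokkos an InitArguments object instead of forwarding argc/argv. My reading, which is an assumption about the Kokkos 3.x struct rather than something stated in this patch, is that the three positional arguments are host thread count, NUMA count (-1 lets Kokkos decide) and device id; the field-by-field form below says the same thing more explicitly.

// Assumed field-wise equivalent of Kokkos::InitArguments(num_threads, -1, device_id).
Kokkos::InitArguments args;
args.num_threads = num_threads;  // host threads for the OpenMP/Threads backends
args.num_numa    = -1;           // leave NUMA placement to Kokkos
args.device_id   = device_id;    // device to select when a GPU backend is enabled
Kokkos::initialize(args);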
Kokkos::initialize(Kokkos::InitArguments(num_threads, -1, device_id)); + + if (params.verbose) { + Kokkos::print_configuration(std::cout); + } + + // Work goes here. + KokkosKernels::Experiment::experiment(params); + + Kokkos::finalize(); + std::cout << "Done." << std::endl; + return 0; } diff --git a/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp index f4a0026de1..9fa6adb484 100644 --- a/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_coarsening.cpp @@ -1,23 +1,23 @@ #include "KokkosGraph_wiki_9pt_stencil.hpp" #include "KokkosGraph_MIS2.hpp" -int main() -{ +int main() { Kokkos::initialize(); { using GraphDemo::numVertices; RowmapType rowmapDevice; ColindsType colindsDevice; - //Step 1: Generate the graph on host, allocate space on device, and copy. - //See function "generate9pt" below. + // Step 1: Generate the graph on host, allocate space on device, and copy. + // See function "generate9pt" below. GraphDemo::generate9pt(rowmapDevice, colindsDevice); - //Step 2: Run MIS-2 based coarsening and print the result + // Step 2: Run MIS-2 based coarsening and print the result { std::cout << "Coarsened vertex labels:\n"; Ordinal numClusters = 0; - auto labels = KokkosGraph::Experimental::graph_mis2_coarsen( - rowmapDevice, colindsDevice, numClusters, KokkosGraph::MIS2_FAST); - //coarsening labels can be printed in the same way as colors + auto labels = + KokkosGraph::graph_mis2_aggregate( + rowmapDevice, colindsDevice, numClusters); + // coarsening labels can be printed in the same way as colors GraphDemo::printColoring(labels, numClusters); putchar('\n'); } @@ -25,4 +25,3 @@ int main() Kokkos::finalize(); return 0; } - diff --git a/example/wiki/graph/KokkosGraph_wiki_mis2.cpp b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp index 466d506170..2b56af9c96 100644 --- a/example/wiki/graph/KokkosGraph_wiki_mis2.cpp +++ b/example/wiki/graph/KokkosGraph_wiki_mis2.cpp @@ -1,29 +1,32 @@ #include "KokkosGraph_wiki_9pt_stencil.hpp" #include "KokkosGraph_MIS2.hpp" -int main() -{ +int main() { Kokkos::initialize(); { using GraphDemo::numVertices; RowmapType rowmapDevice; ColindsType colindsDevice; - //Step 1: Generate the graph on host, allocate space on device, and copy. - //See function "generate9pt" below. + // Step 1: Generate the graph on host, allocate space on device, and copy. + // See function "generate9pt" below. 
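These wiki graph examples (coarsening above, MIS-2 here) describe the 9-point stencil graph with a rowmap view and a column-index view, i.e. a compressed-sparse-row adjacency structure in which vertex v's neighbours sit in colinds[rowmap[v] .. rowmap[v+1]). A tiny host-side illustration with a hypothetical 3-vertex path graph, unrelated to the stencil the demo actually generates:

#include <cstdio>

int main() {
  // CSR adjacency for the path graph 0 - 1 - 2.
  const int rowmap[4]  = {0, 1, 3, 4};
  const int colinds[4] = {1, 0, 2, 1};
  for (int v = 0; v < 3; ++v) {
    std::printf("vertex %d:", v);
    for (int e = rowmap[v]; e < rowmap[v + 1]; ++e) std::printf(" %d", colinds[e]);
    std::printf("\n");
  }
  return 0;
}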
GraphDemo::generate9pt(rowmapDevice, colindsDevice); - //Step 2: Run distance-2 MIS and print the results, with three different algorithms + // Step 2: Run distance-2 MIS and print the results, with three different + // algorithms { - //Run coloring - auto misDevice = KokkosGraph::Experimental::graph_d2_mis( - rowmapDevice, colindsDevice, KokkosGraph::MIS2_FAST); + // Run coloring + auto misDevice = + KokkosGraph::graph_d2_mis( + rowmapDevice, colindsDevice, KokkosGraph::MIS2_FAST); std::cout << "Distance-2 MIS, FAST algorithm: contains " - << misDevice.extent(0) << " out of " << GraphDemo::numVertices << " vertices.\n"; + << misDevice.extent(0) << " out of " << GraphDemo::numVertices + << " vertices.\n"; GraphDemo::printMIS(misDevice); putchar('\n'); - misDevice = KokkosGraph::Experimental::graph_d2_mis( + misDevice = KokkosGraph::graph_d2_mis( rowmapDevice, colindsDevice, KokkosGraph::MIS2_QUALITY); std::cout << "Distance-2 MIS, QUALITY algorithm: contains " - << misDevice.extent(0) << " out of " << GraphDemo::numVertices << " vertices.\n"; + << misDevice.extent(0) << " out of " << GraphDemo::numVertices + << " vertices.\n"; GraphDemo::printMIS(misDevice); putchar('\n'); } @@ -31,4 +34,3 @@ int main() Kokkos::finalize(); return 0; } - diff --git a/install_test/CMakeLists.txt.in b/install_test/CMakeLists.txt.in index 74605ac73f..edf6c2cc1a 100644 --- a/install_test/CMakeLists.txt.in +++ b/install_test/CMakeLists.txt.in @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.13) +cmake_minimum_required(VERSION 3.16) project(kokkoskernels_install_test CXX) include(CTest) diff --git a/master_history.txt b/master_history.txt index 652150a31f..222913d92c 100644 --- a/master_history.txt +++ b/master_history.txt @@ -15,3 +15,4 @@ tag: 3.3.01 date: 01/18/2021 master: f64b1c57 release: 4e1cc00b tag: 3.4.00 date: 04/26/2021 master: fe439b21 release: d3c33910 tag: 3.4.01 date: 05/20/2021 master: 564dccb3 release: 4c62eb86 tag: 3.5.00 date: 11/19/2021 master: 00189c0b release: f171533d +tag: 3.6.00 date: 04/06/2022 master: 8381db04 release: a7e683c4 diff --git a/perf_test/CMakeLists.txt b/perf_test/CMakeLists.txt index d9ec2a34d9..91dc727867 100644 --- a/perf_test/CMakeLists.txt +++ b/perf_test/CMakeLists.txt @@ -35,6 +35,7 @@ if (KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) blas/blas2/KokkosBlas2_gemv_tracked_perf_test.cpp blas/blas1/KokkosBlas_dot_tracked_perf_test.cpp blas/blas1/KokkosBlas_team_dot_tracked_perf_test.cpp + blas/blas3/KokkosBlas3_gemm_tracked_perf_test.cpp PerfTestUtilities.cpp sparse/spmv/OpenMPSmartStatic_SPMV.cpp #sparse / KokkosSparse_spgemm_test.cpp diff --git a/perf_test/KokkosKernelsTrackedTesting.cpp b/perf_test/KokkosKernelsTrackedTesting.cpp index ffb7f98447..10fb834270 100644 --- a/perf_test/KokkosKernelsTrackedTesting.cpp +++ b/perf_test/KokkosKernelsTrackedTesting.cpp @@ -9,6 +9,7 @@ // For RPS version of BLAS Level-1 Tests #include "blas/blas1/tracked_testing.hpp" #include "blas/blas2/tracked_testing.hpp" +#include "blas/blas3/tracked_testing.hpp" int main(int argc, char* argv[]) { { // argument parsing for setting input data at runtime @@ -55,6 +56,8 @@ int main(int argc, char* argv[]) { test::blas2::build_blas2_executor(exec, argc, argv, run_params); + test::blas3::build_blas3_executor(exec, argc, argv, run_params); + exec.setupSuite(); // STEP 3: Report suite run summary diff --git a/perf_test/PerfTestUtilities.hpp b/perf_test/PerfTestUtilities.hpp index 828c0d285a..743df53502 100644 --- a/perf_test/PerfTestUtilities.hpp +++ b/perf_test/PerfTestUtilities.hpp @@ -46,8 +46,9 
@@ inline std::vector get_directories(std::string path) { while ((dir = readdir(d)) != NULL) { std::string nname = std::string(dir->d_name); // Check to see if item is a directory - //if (isDirectory(path + '/' + nname)) - if(nname != "." && nname != ".." && isDirectory(path + '/' + dir->d_name)) + // if (isDirectory(path + '/' + nname)) + if (nname != "." && nname != ".." && + isDirectory(path + '/' + dir->d_name)) // std::vector::emplace_back: insert a new element to the end of vector paths.emplace_back(dir->d_name); } diff --git a/perf_test/batched/KokkosBatched_Test_BlockCrs_Cuda.cpp b/perf_test/batched/KokkosBatched_Test_BlockCrs_Cuda.cpp index 2930aa4e79..50f15cf719 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockCrs_Cuda.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockCrs_Cuda.cpp @@ -22,7 +22,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Timer.hpp" -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #define __KOKKOSBATCHED_TEST_ENABLE_CUDA__ #include "KokkosBatched_Util.hpp" @@ -30,16 +30,16 @@ #define KOKKOSBATCHED_USE_UNBLOCKED_ALGO 1 //#define KOKKOSBATCHED_USE_BLOCKED_ALGO 1 -#if defined (KOKKOSBATCHED_USE_UNBLOCKED_ALGO) -typedef KokkosBatched::Algo::LU::Unblocked AlgoLU; +#if defined(KOKKOSBATCHED_USE_UNBLOCKED_ALGO) +typedef KokkosBatched::Algo::LU::Unblocked AlgoLU; typedef KokkosBatched::Algo::Trsm::Unblocked AlgoTrsm; typedef KokkosBatched::Algo::Gemm::Unblocked AlgoGemm; typedef KokkosBatched::Algo::Trsv::Unblocked AlgoTrsv; typedef KokkosBatched::Algo::Gemv::Unblocked AlgoGemv; #endif -#if defined (KOKKOSBATCHED_USE_BLOCKED_ALGO) -typedef KokkosBatched::Algo::LU::Blocked AlgoLU; +#if defined(KOKKOSBATCHED_USE_BLOCKED_ALGO) +typedef KokkosBatched::Algo::LU::Blocked AlgoLU; typedef KokkosBatched::Algo::Trsm::Blocked AlgoTrsm; typedef KokkosBatched::Algo::Gemm::Blocked AlgoGemm; @@ -51,8 +51,8 @@ typedef KokkosBatched::Algo::Gemv::Blocked AlgoGemv; using namespace KokkosBatched; -int main (int argc, char *argv[]) { - Kokkos::initialize(argc, argv); +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); typedef Kokkos::DefaultExecutionSpace DeviceSpaceType; @@ -60,40 +60,53 @@ int main (int argc, char *argv[]) { Kokkos::print_configuration(std::cout, detail); - enum : int { VectorLength = DefaultVectorLength::value, - RangeTagOper = 0, - TeamTagOper = 1 }; - + enum : int { + VectorLength = + DefaultVectorLength::value, + RangeTagOper = 0, + TeamTagOper = 1 + }; + // Unit tests bool profile = false; - for (int i=1;i( 3, 4, 2, 25, 2); - // Test::run(44, 63, 15, 4, 1); - // Test::run( 2, 2, 15, 3, 3); - // Test::run( 1, 1, 2, 63, 8); - + // Test::run( + // 3, 4, 2, 25, 2); + // Test::run(44, + // 63, 15, 4, 1); + // Test::run( + // 2, 2, 15, 3, 3); + // Test::run( + // 1, 1, 2, 63, 8); + // for (int nrhs=1;nrhs<=33;++nrhs) - // Test::run(2, 2, 15, 3, nrhs); + // Test::run(2, + // 2, 15, 3, nrhs); // } // std::cout << " Unit Test::Range :: End\n"; - + std::cout << " Unit Test::Team :: Begin\n"; { - Test::run( 3, 4, 2, 25, 2); - Test::run(44, 63, 15, 4, 1); - Test::run( 2, 2, 15, 3, 3); - Test::run( 1, 1, 2, 63, 8); - - for (int nrhs=1;nrhs<=33;++nrhs) - Test::run(2, 2, 15, 3, nrhs); + Test::run( + 3, 4, 2, 25, 2); + Test::run( + 44, 63, 15, 4, 1); + Test::run( + 2, 2, 15, 3, 3); + Test::run( + 1, 1, 2, 63, 8); + + for (int nrhs = 1; nrhs <= 33; ++nrhs) + Test::run(2, 2, 15, 3, nrhs); } std::cout << " Unit Test::Team :: End\n"; } @@ -101,9 +114,9 @@ int main (int argc, char *argv[]) { // Performance tests std::cout << " Perf Test:: 
Begin\n"; { - const Test::Input input(argc, argv); - Test::run(input); - } + const Test::Input input(argc, argv); + Test::run(input); + } std::cout << " Perf Test:: End\n"; Kokkos::finalize(); diff --git a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp index f682e1e119..1319fa03db 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockCrs_Host.cpp @@ -27,16 +27,16 @@ //#define KOKKOSBATCHED_USE_UNBLOCKED_ALGO 1 #define KOKKOSBATCHED_USE_BLOCKED_ALGO 1 -#if defined (KOKKOSBATCHED_USE_UNBLOCKED_ALGO) -typedef KokkosBatched::Algo::LU::Unblocked AlgoLU; +#if defined(KOKKOSBATCHED_USE_UNBLOCKED_ALGO) +typedef KokkosBatched::Algo::LU::Unblocked AlgoLU; typedef KokkosBatched::Algo::Trsm::Unblocked AlgoTrsm; typedef KokkosBatched::Algo::Gemm::Unblocked AlgoGemm; typedef KokkosBatched::Algo::Trsv::Unblocked AlgoTrsv; typedef KokkosBatched::Algo::Gemv::Unblocked AlgoGemv; #endif -#if defined (KOKKOSBATCHED_USE_BLOCKED_ALGO) -typedef KokkosBatched::Algo::LU::Blocked AlgoLU; +#if defined(KOKKOSBATCHED_USE_BLOCKED_ALGO) +typedef KokkosBatched::Algo::LU::Blocked AlgoLU; typedef KokkosBatched::Algo::Trsm::Blocked AlgoTrsm; typedef KokkosBatched::Algo::Gemm::Blocked AlgoGemm; @@ -48,8 +48,8 @@ typedef KokkosBatched::Algo::Gemv::Blocked AlgoGemv; using namespace KokkosBatched; -int main (int argc, char *argv[]) { - Kokkos::initialize(argc, argv); +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -57,53 +57,60 @@ int main (int argc, char *argv[]) { Kokkos::print_configuration(std::cout, detail); - enum : int { VectorLength = DefaultVectorLength::value, - RangeTagOper = 0 }; + enum : int { + VectorLength = + DefaultVectorLength::value, + RangeTagOper = 0 + }; // vector type - typedef Vector,VectorLength> VectorType; + typedef Vector, VectorLength> VectorType; // Unit tests bool profile = false; - for (int i=1;i( 3, 4, 2, 25, 2); - Test::run(44, 63, 15, 4, 1); - Test::run( 2, 2, 15, 3, 3); - - for (int nrhs=1;nrhs<=33;++nrhs) - Test::run(2, 2, 15, 3, nrhs); + Test::run(3, 4, 2, + 25, 2); + Test::run( + 44, 63, 15, 4, 1); + Test::run(2, 2, 15, + 3, 3); + + for (int nrhs = 1; nrhs <= 33; ++nrhs) + Test::run( + 2, 2, 15, 3, nrhs); } std::cout << " Unit Test::Range::Vector :: End\n"; } - + // MKL #if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) std::cout << " Perf Test::CompactMKL Begin\n"; { const bool test_mkl = true; - const Test::Input input(argc, argv); - Test::run(input, test_mkl); - } - std::cout << " Perf Test::CompactMKL End\n"; + const Test::Input input(argc, argv); + Test::run(input, test_mkl); + } + std::cout << " Perf Test::CompactMKL End\n"; #endif // Performance tests std::cout << " Perf Test::Vector Begin\n"; { - const Test::Input input(argc, argv); - Test::run(input); - } + const Test::Input input(argc, argv); + Test::run(input); + } std::cout << " Perf Test::Vector End\n"; #endif diff --git a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp index cd2e0015a0..f3237d9b4f 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockJacobi_Tutorial.cpp @@ -3,16 +3,15 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" - -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +#if 
defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#define KOKKOSBATCHED_TEST_BLOCKJACOBI -#endif +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#define KOKKOSBATCHED_TEST_BLOCKJACOBI +#endif #endif #endif -#if defined(KOKKOSBATCHED_TEST_BLOCKJACOBI) +#if defined(KOKKOSBATCHED_TEST_BLOCKJACOBI) /// KokkosKernels headers #include "KokkosBatched_Util.hpp" @@ -35,57 +34,52 @@ #include "cuda_profiler_api.h" #endif - -using exec_space_type = Kokkos::DefaultExecutionSpace; +using exec_space_type = Kokkos::DefaultExecutionSpace; using memory_space_type = typename exec_space_type::memory_space; -using host_space = Kokkos::DefaultHostExecutionSpace; +using host_space = Kokkos::DefaultHostExecutionSpace; -using val_type = double; +using val_type = double; using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; using namespace KokkosBatched; -template -val_type computeResidual(const ManyMatrixType &A, - const ManyVectorType &x, - const ManyVectorType &b, - const ManyVectorType &r) { +template +val_type computeResidual(const ManyMatrixType &A, const ManyVectorType &x, + const ManyVectorType &b, const ManyVectorType &r) { /// compute residual val_type residual(0); { - policy_type policy(A.extent(0), Kokkos::AUTO()); + policy_type policy(A.extent(0), Kokkos::AUTO()); Kokkos::deep_copy(r, b); - Kokkos::parallel_reduce - ("compute-residual", - policy, KOKKOS_LAMBDA(const member_type &member, val_type &update) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto rr = Kokkos::subview(r, i, Kokkos::ALL()); - - TeamGemv - ::invoke(member, -one, AA, xx, one, rr); - - val_type sum(0); - Kokkos::parallel_reduce - (Kokkos::TeamThreadRange(member, rr.extent(0)), - [&](const int &k, val_type &lsum) { - lsum += Kokkos::ArithTraits::abs(rr(k)); - }, sum); - Kokkos::single(Kokkos::PerTeam(member), [&]() { - update += sum; - }); - }, residual); + Kokkos::parallel_reduce( + "compute-residual", policy, + KOKKOS_LAMBDA(const member_type &member, val_type &update) { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(x, i, Kokkos::ALL()); + auto rr = Kokkos::subview(r, i, Kokkos::ALL()); + + TeamGemv::invoke(member, -one, AA, xx, one, + rr); + + val_type sum(0); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(member, rr.extent(0)), + [&](const int &k, val_type &lsum) { + lsum += Kokkos::ArithTraits::abs(rr(k)); + }, + sum); + Kokkos::single(Kokkos::PerTeam(member), [&]() { update += sum; }); + }, + residual); } return residual; } -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { #if defined(KOKKOS_ENABLE_CUDA) @@ -98,16 +92,15 @@ int main(int argc, char* argv[]) { /// /// input arguments parsing /// - int N = 128*128; /// # of problems (batch size) - int Blk = 5; /// block dimension - for (int i=1;i A("block diagonals", N, Blk, Blk); - Kokkos::View T("temporal block diagonals", N, Blk, Blk); - Kokkos::View x("x", N, Blk); - Kokkos::View b("b", N, Blk); + Kokkos::View A( + "block diagonals", N, Blk, Blk); + Kokkos::View T( + "temporal block diagonals", N, Blk, Blk); + Kokkos::View x("x", N, + Blk); + Kokkos::View b("b", N, + Blk); /// copy of A to check residual - Kokkos::View Acopy("Acopy", - A.extent(0), 
- A.extent(1), - A.extent(2)); + Kokkos::View Acopy( + "Acopy", A.extent(0), A.extent(1), A.extent(2)); /// residual vector - Kokkos::View r("r", - b.extent(0), - b.extent(1)); + Kokkos::View r( + "r", b.extent(0), b.extent(1)); - /// The block diagonal matrices are assumed to be extracted from a block sparse matrix. - /// Here we set the blocks with random values + /// The block diagonal matrices are assumed to be extracted from a block + /// sparse matrix. Here we set the blocks with random values Kokkos::Random_XorShift64_Pool random(13245); Kokkos::fill_random(A, random, val_type(1.0)); Kokkos::fill_random(b, random, val_type(1.0)); @@ -143,8 +137,8 @@ int main(int argc, char* argv[]) { /// /// Objective : /// - Construct the inverse of A(i,:,:) for all i. - /// - Solve the equation using matrix vector multiplication. - + /// - Solve the equation using matrix vector multiplication. + /// Task 1. Use the so-called standard batch interface /// parallel_for(factorize) /// parallel_For(set identity matrix) @@ -157,90 +151,95 @@ int main(int argc, char* argv[]) { cudaProfilerStart(); #endif Kokkos::deep_copy(A, Acopy); - + /// construction of block jacobi using batched blas interface /// each parallel for is a batch function { - policy_type policy(A.extent(0), Kokkos::AUTO()); - timer.reset(); - Kokkos::parallel_for - ("task1.factorize", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - TeamLU::invoke(member,AA); - }); - Kokkos::deep_copy(T, A); - Kokkos::parallel_for - ("task1.set-identity", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - TeamSetIdentity::invoke(member, AA); - }); - Kokkos::fence(); - Kokkos::parallel_for - ("task1.solve-lower-triangular", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm - ::invoke(member, one, TT, AA); - }); - Kokkos::fence(); - Kokkos::parallel_for - ("task1.solve-upper-triangular", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - TeamTrsm - ::invoke(member, one, TT, AA); - }); - Kokkos::fence(); - const double t = timer.seconds(); - printf("task 1: construction of jacobi time = %f , # of constructions per min = %.0f \n", t, 1.0/t*60); + policy_type policy(A.extent(0), Kokkos::AUTO()); + timer.reset(); + Kokkos::parallel_for( + "task1.factorize", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + TeamLU::invoke(member, AA); + }); + Kokkos::deep_copy(T, A); + Kokkos::parallel_for( + "task1.set-identity", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + TeamSetIdentity::invoke(member, AA); + }); + Kokkos::fence(); + Kokkos::parallel_for( + "task1.solve-lower-triangular", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(A, i, 
Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); + TeamTrsm::invoke(member, one, + TT, AA); + }); + Kokkos::fence(); + Kokkos::parallel_for( + "task1.solve-upper-triangular", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + const val_type one(1); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); + TeamTrsm::invoke(member, + one, TT, + AA); + }); + Kokkos::fence(); + const double t = timer.seconds(); + printf( + "task 1: construction of jacobi time = %f , # of constructions per " + "min = %.0f \n", + t, 1.0 / t * 60); } - + /// apply block jacobi { - timer.reset(); - policy_type policy(A.extent(0), Kokkos::AUTO()); - Kokkos::parallel_for - ("task1.apply-block-jacobi", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1), zero(0); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto bb = Kokkos::subview(b, i, Kokkos::ALL()); - TeamGemv - ::invoke(member, one, AA, bb, zero, xx); - }); - const double t = timer.seconds(); - printf("task 1: application of jacobi time = %f , # of applications per min = %.0f \n", t, 1.0/t*60); + timer.reset(); + policy_type policy(A.extent(0), Kokkos::AUTO()); + Kokkos::parallel_for( + "task1.apply-block-jacobi", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + const val_type one(1), zero(0); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(x, i, Kokkos::ALL()); + auto bb = Kokkos::subview(b, i, Kokkos::ALL()); + TeamGemv::invoke(member, one, AA, bb, + zero, xx); + }); + const double t = timer.seconds(); + printf( + "task 1: application of jacobi time = %f , # of applications per " + "min = %.0f \n", + t, 1.0 / t * 60); } /// check residual { - const double residual = computeResidual(Acopy, x, b, r); - printf("task 1: residual = %f\n", residual); + const double residual = computeResidual(Acopy, x, b, r); + printf("task 1: residual = %f\n", residual); } -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) cudaProfilerStop(); -#endif +#endif } - - /// Task 2. Compose a new batch function using kokkos batched team-level interface + + /// Task 2. 
Compose a new batch function using kokkos batched team-level + /// interface /// parallel_for(LU, set identity, solve lower/upper triangular) /// parallel_for(matrix vector multiplication) @@ -249,78 +248,77 @@ int main(int argc, char* argv[]) { cudaProfilerStart(); #endif Kokkos::deep_copy(A, Acopy); - + /// construction of block jacobi using batched blas interface /// each parallel for is a batch function { - policy_type policy(A.extent(0), Kokkos::AUTO()); - timer.reset(); - Kokkos::parallel_for - ("task2.factorize-invert", - policy, KOKKOS_LAMBDA(const member_type &member) { - const val_type one(1); - const int i = member.league_rank(); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); - - TeamLU::invoke(member,AA); - TeamCopy::invoke(member, AA, TT); - TeamSetIdentity::invoke(member, AA); - TeamTrsm - ::invoke(member, one, TT, AA); - TeamTrsm - ::invoke(member, one, TT, AA); - }); - Kokkos::fence(); - const double t = timer.seconds(); - printf("task 2: construction of jacobi time = %f , # of constructions per min = %.0f \n", t, 1.0/t*60); + policy_type policy(A.extent(0), Kokkos::AUTO()); + timer.reset(); + Kokkos::parallel_for( + "task2.factorize-invert", policy, + KOKKOS_LAMBDA(const member_type &member) { + const val_type one(1); + const int i = member.league_rank(); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto TT = Kokkos::subview(T, i, Kokkos::ALL(), Kokkos::ALL()); + + TeamLU::invoke(member, AA); + TeamCopy::invoke(member, AA, TT); + TeamSetIdentity::invoke(member, AA); + TeamTrsm::invoke(member, one, + TT, AA); + TeamTrsm::invoke(member, + one, TT, + AA); + }); + Kokkos::fence(); + const double t = timer.seconds(); + printf( + "task 2: construction of jacobi time = %f , # of constructions per " + "min = %.0f \n", + t, 1.0 / t * 60); } - + /// apply block jacobi { - timer.reset(); - policy_type policy(A.extent(0), Kokkos::AUTO()); - Kokkos::parallel_for - ("task2.apply-block-jacobi", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - const val_type one(1), zero(0); - auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto xx = Kokkos::subview(x, i, Kokkos::ALL()); - auto bb = Kokkos::subview(b, i, Kokkos::ALL()); - TeamGemv - ::invoke(member, one, AA, bb, zero, xx); - }); - const double t = timer.seconds(); - printf("task 2: application of jacobi time = %f , # of applications per min = %.0f \n", t, 1.0/t*60); + timer.reset(); + policy_type policy(A.extent(0), Kokkos::AUTO()); + Kokkos::parallel_for( + "task2.apply-block-jacobi", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + const val_type one(1), zero(0); + auto AA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto xx = Kokkos::subview(x, i, Kokkos::ALL()); + auto bb = Kokkos::subview(b, i, Kokkos::ALL()); + TeamGemv::invoke(member, one, AA, bb, + zero, xx); + }); + const double t = timer.seconds(); + printf( + "task 2: application of jacobi time = %f , # of applications per " + "min = %.0f \n", + t, 1.0 / t * 60); } /// check residual { - const double residual = computeResidual(Acopy, x, b, r); - printf("task 2: residual = %f\n", residual); + const double residual = computeResidual(Acopy, x, b, r); + printf("task 2: residual = %f\n", residual); } -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) cudaProfilerStop(); -#endif +#endif } - } Kokkos::finalize(); return 0; } - #else -int main() { - 
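For reference, the fused "task2.factorize-invert" kernel above builds each explicit block inverse with the same four steps that task 1 issues as separate launches: factor the block in place as A = LU, copy the factors aside, reset the block to the identity, then run the two TeamTrsm solves (L*X = I, then U*Ainv = X). A minimal standalone sketch of that sequence for one small dense block, assuming row-major storage and no pivoting, and not part of this patch:

// Minimal sketch: invert one n x n block by the same sequence the kernel uses,
// i.e. in-place LU (no pivoting), set the identity, then a unit-lower and a
// non-unit upper triangular solve.
#include <cstdio>
#include <vector>

int main() {
  const int n = 3;
  std::vector<double> A = {4, 1, 0,   // a small, diagonally dominant block
                           1, 4, 1,   // (row-major storage for the sketch)
                           0, 1, 4};
  std::vector<double> Ainv(n * n, 0.0);

  // Step 1: in-place LU, analogous to TeamLU::invoke(member, AA).
  for (int k = 0; k < n; ++k)
    for (int i = k + 1; i < n; ++i) {
      A[i * n + k] /= A[k * n + k];
      for (int j = k + 1; j < n; ++j)
        A[i * n + j] -= A[i * n + k] * A[k * n + j];
    }

  // Step 2: start from the identity, analogous to TeamSetIdentity.
  for (int i = 0; i < n; ++i) Ainv[i * n + i] = 1.0;

  // Step 3: solve L * X = I (unit lower), analogous to the first TeamTrsm.
  for (int j = 0; j < n; ++j)
    for (int i = 1; i < n; ++i)
      for (int k = 0; k < i; ++k)
        Ainv[i * n + j] -= A[i * n + k] * Ainv[k * n + j];

  // Step 4: solve U * Ainv = X (non-unit upper), analogous to the second TeamTrsm.
  for (int j = 0; j < n; ++j)
    for (int i = n - 1; i >= 0; --i) {
      for (int k = i + 1; k < n; ++k)
        Ainv[i * n + j] -= A[i * n + k] * Ainv[k * n + j];
      Ainv[i * n + j] /= A[i * n + i];
    }

  // Ainv now holds the block inverse; applying it is one GEMV, as in
  // "task2.apply-block-jacobi".
  for (int i = 0; i < n; ++i, printf("\n"))
    for (int j = 0; j < n; ++j) printf(" % .4f", Ainv[i * n + j]);
  return 0;
}

Fusing the four steps into one team kernel is the point of task 2: it avoids three extra kernel launches and the global-memory round trips between them, while producing the same explicit inverse.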
return 0; -} +int main() { return 0; } #endif - diff --git a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp index 4183380854..a8b3de209b 100644 --- a/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp +++ b/perf_test/batched/KokkosBatched_Test_BlockTridiagDirect.cpp @@ -3,14 +3,13 @@ #include "Kokkos_Timer.hpp" #include "Kokkos_Random.hpp" -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) #if !defined(KOKKOS_ENABLE_CUDA) || (8000 <= CUDA_VERSION) -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT -#endif +#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) +#define KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT +#endif #endif #endif - #if defined(KOKKOSBATCHED_TEST_BLOCKTRIDIAGDIRECT) @@ -60,55 +59,57 @@ typedef double value_type; /// using namespace KokkosBatched; -static constexpr int vector_length = DefaultVectorLength::value; +static constexpr int vector_length = + DefaultVectorLength::value; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -static constexpr int internal_vector_length = DefaultInternalVectorLength::value; +static constexpr int internal_vector_length = + DefaultInternalVectorLength::value; #else static constexpr int internal_vector_length = 1; #endif -typedef Vector,vector_length> vector_type; +typedef Vector, vector_length> vector_type; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -typedef Vector,internal_vector_length> internal_vector_type; +typedef Vector, internal_vector_length> internal_vector_type; #else typedef value_type internal_vector_type; #endif -template +template struct FactorizeModeAndAlgo; -template<> +template <> struct FactorizeModeAndAlgo { typedef Mode::Serial mode_type; - typedef Algo::Level3::Blocked algo_type; + typedef Algo::Level3::Blocked algo_type; }; #if defined(KOKKOS_ENABLE_CUDA) -template<> +template <> struct FactorizeModeAndAlgo { typedef Mode::Team mode_type; - typedef Algo::Level3::Unblocked algo_type; + typedef Algo::Level3::Unblocked algo_type; }; #endif -template +template struct SolveModeAndAlgo; -template<> +template <> struct SolveModeAndAlgo { typedef Mode::Serial mode_type; - typedef Algo::Level2::Blocked algo_type; + typedef Algo::Level2::Blocked algo_type; }; #if defined(KOKKOS_ENABLE_CUDA) -template<> +template <> struct SolveModeAndAlgo { typedef Mode::Team mode_type; - typedef Algo::Level2::Unblocked algo_type; + typedef Algo::Level2::Unblocked algo_type; }; #endif -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -116,20 +117,20 @@ int main(int argc, char* argv[]) { #endif Kokkos::print_configuration(std::cout); - //typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::Details::ArithTraits ats; Kokkos::Timer timer; /// /// input arguments parsing /// - int N = 128*128; /// # of problems (batch size) - int L = 128; /// length of block tridiags - int Blk = 5; /// block dimension - int Nvec = 1; - int S = 0; /// scratch size + int N = 128 * 128; /// # of problems (batch size) + int L = 128; /// length of block tridiags + int Blk = 5; /// block dimension + int Nvec = 1; + int S = 0; /// scratch size int niter = 1; - for (int i=1;i Av("A", - N/vector_length, L, 3, Blk, Blk); + Kokkos::View Av( + "A", N / vector_length, L, 3, Blk, Blk); /// double - Kokkos::View As((value_type*)Av.data(), - Av.extent(0), - Av.extent(1), - Av.extent(2), - 
Av.extent(3), - Av.extent(4), - vector_length); + Kokkos::View As( + (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), + Av.extent(3), Av.extent(4), vector_length); /// double 2 - Kokkos::View Ai((internal_vector_type*)Av.data(), - Av.extent(0), - Av.extent(1), - Av.extent(2), - Av.extent(3), - Av.extent(4), - vector_length/internal_vector_length); + Kokkos::View + Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), + Av.extent(2), Av.extent(3), Av.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View xv("x", - N/vector_length, Nvec, L, Blk); + Kokkos::View xv( + "x", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View xs((value_type*)xv.data(), - xv.extent(0), - xv.extent(1), - xv.extent(2), - xv.extent(3), - vector_length); + Kokkos::View xs( + (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), + xv.extent(3), vector_length); /// double 2 - Kokkos::View xi((internal_vector_type*)xv.data(), - xv.extent(0), - xv.extent(1), - xv.extent(2), - xv.extent(3), - vector_length/internal_vector_length); + Kokkos::View + xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), + xv.extent(2), xv.extent(3), vector_length / internal_vector_length); /// double 16 - Kokkos::View bv("b", - N/vector_length, Nvec, L, Blk); + Kokkos::View bv( + "b", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View bs((value_type*)bv.data(), - bv.extent(0), - bv.extent(1), - bv.extent(2), - bv.extent(3), - vector_length); + Kokkos::View bs( + (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), + bv.extent(3), vector_length); /// double 2 - Kokkos::View bi((internal_vector_type*)bv.data(), - bv.extent(0), - bv.extent(1), - bv.extent(2), - bv.extent(3), - vector_length/internal_vector_length); - + Kokkos::View + bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), + bv.extent(2), bv.extent(3), vector_length / internal_vector_length); /// double copy of A - Kokkos::View Acopy("Acopy", - As.extent(0), - As.extent(1), - As.extent(2), - As.extent(3), - As.extent(4), - As.extent(5)); - - Kokkos::View rs("rs", - bs.extent(0), - bs.extent(1), - bs.extent(2), - bs.extent(3), - bs.extent(4)); + Kokkos::View Acopy( + "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), + As.extent(4), As.extent(5)); + + Kokkos::View rs( + "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3), + bs.extent(4)); #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) auto AA = Ai; @@ -245,17 +220,21 @@ int main(int argc, char* argv[]) { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; policy_type policy(AA.extent(0), Kokkos::AUTO(), AA.extent(5)); - Kokkos::parallel_for - ("setTridiagToIdentity", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member,AA.extent(1)),[&](const int &j) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - for (int k=0,kend=AA.extent(3);k random(13245); Kokkos::fill_random(As, random, one); Kokkos::fill_random(bs, random, one); - + Kokkos::deep_copy(Acopy, As); } @@ -284,70 +263,76 @@ int main(int argc, char* argv[]) { timer.reset(); using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - int team_size = 0; - if (Blk < 8) { team_size = 32/AA.extent(5); - } else if (Blk < 12) { team_size = 64/AA.extent(5); - } else { team_size = 128/AA.extent(5); } + int team_size = 0; + 
if (Blk < 8) { + team_size = 32 / AA.extent(5); + } else if (Blk < 12) { + team_size = 64 / AA.extent(5); + } else { + team_size = 128 / AA.extent(5); + } policy_type policy(AA.extent(0), team_size, AA.extent(5)); - Kokkos::parallel_for - ("factorize", - policy.set_scratch_size(0,Kokkos::PerTeam(S)), - KOKKOS_LAMBDA(const member_type &member) { - typedef FactorizeModeAndAlgo default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; - - const int i = member.league_rank(); - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - auto AAA = Kokkos::subview(AA, i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), v); - - /// subview patterns - auto A = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); - auto C = Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - auto D = Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); - - if (L == 1) { - A.assign_data( &AAA(0, 1, 0, 0) ); - LU::invoke(member, A); - } else { - for (int k=0;k<(L-1);++k) { - A.assign_data( &AAA(k, 1, 0, 0) ); - B.assign_data( &AAA(k, 2, 0, 0) ); - C.assign_data( &AAA(k, 0, 0, 0) ); - D.assign_data( &AAA(k+1, 1, 0, 0) ); - - LU - ::invoke(member, A); - Trsm - ::invoke(member, 1.0, A, B); - Trsm - ::invoke(member, 1.0, A, C); - Gemm - ::invoke(member, -1.0, C, B, 1.0, D); - } - LU - ::invoke(member, D); - } - }); - }); + Kokkos::parallel_for( + "factorize", policy.set_scratch_size(0, Kokkos::PerTeam(S)), + KOKKOS_LAMBDA(const member_type &member) { + typedef FactorizeModeAndAlgo< + Kokkos::Impl::ActiveExecutionMemorySpace> + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + const int i = member.league_rank(); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + auto AAA = + Kokkos::subview(AA, i, Kokkos::ALL(), Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), v); + + /// subview patterns + auto A = + Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + auto B = + Kokkos::subview(AAA, 0, 2, Kokkos::ALL(), Kokkos::ALL()); + auto C = + Kokkos::subview(AAA, 0, 0, Kokkos::ALL(), Kokkos::ALL()); + auto D = + Kokkos::subview(AAA, 0, 1, Kokkos::ALL(), Kokkos::ALL()); + + if (L == 1) { + A.assign_data(&AAA(0, 1, 0, 0)); + LU::invoke(member, A); + } else { + for (int k = 0; k < (L - 1); ++k) { + A.assign_data(&AAA(k, 1, 0, 0)); + B.assign_data(&AAA(k, 2, 0, 0)); + C.assign_data(&AAA(k, 0, 0, 0)); + D.assign_data(&AAA(k + 1, 1, 0, 0)); + + LU::invoke(member, A); + Trsm::invoke(member, 1.0, A, B); + Trsm::invoke(member, 1.0, A, C); + Gemm::invoke(member, -1.0, C, B, + 1.0, D); + } + LU::invoke(member, D); + } + }); + }); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("factorize time = %f , # of factorization per min = %f \n", t, 1.0/t*60); + printf("factorize time = %f , # of factorization per min = %f \n", t, + 1.0 / t * 60); } /// @@ -360,133 +345,144 @@ int main(int argc, char* argv[]) { timer.reset(); using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - int team_size = 0; - if (Blk < 8) { team_size = 32/AA.extent(5); - } else if (Blk < 12) { team_size = 64/AA.extent(5); - } else { team_size = 128/AA.extent(5); } - 
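For reference, the "factorize" kernel above performs one block LU sweep per tridiagonal system: it factors the diagonal block, replaces the off-diagonal blocks through the two triangular solves, and subtracts the Schur complement C * A^{-1} * B from the next diagonal block; the "solve" kernel that follows then does blockwise forward and backward substitution. A scalar (1x1 block) analogue of the same recursion, as a minimal illustration rather than the kernels' actual code path:

// Scalar analogue of the factorize/solve recursion: d = diagonal,
// u = superdiagonal, c = subdiagonal of one tridiagonal system of length L.
#include <cstdio>
#include <vector>

int main() {
  const int L = 5;
  std::vector<double> d(L, 4.0), u(L - 1, 1.0), c(L - 1, 1.0);
  std::vector<double> x = {1, 2, 3, 4, 5};  // right-hand side, solved in place

  // Factorization sweep: scale the superdiagonal by the factored diagonal
  // (the Trsm step) and subtract the Schur complement from the next diagonal
  // (the Gemm step, D_{k+1} -= C_k * A_k^{-1} * B_k).
  for (int k = 0; k < L - 1; ++k) {
    u[k] /= d[k];
    d[k + 1] -= c[k] * u[k];
  }

  // Forward substitution (Trsv with the diagonal, Gemv with the subdiagonal).
  for (int k = 0; k < L - 1; ++k) {
    x[k] /= d[k];
    x[k + 1] -= c[k] * x[k];
  }
  x[L - 1] /= d[L - 1];

  // Backward substitution (with the scaled superdiagonal).
  for (int k = L - 1; k > 0; --k) x[k - 1] -= u[k - 1] * x[k];

  for (int k = 0; k < L; ++k) printf("x[%d] = %f\n", k, x[k]);
  return 0;
}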
- policy_type policy(AA.extent(0), team_size, AA.extent(5)); - for (int iter=0;iter default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; + int team_size = 0; + if (Blk < 8) { + team_size = 32 / AA.extent(5); + } else if (Blk < 12) { + team_size = 64 / AA.extent(5); + } else { + team_size = 128 / AA.extent(5); + } - const int i = member.league_rank(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(AA, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(AA, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(AA, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec=0;jvec - ::invoke(member, bk, xb); - member.team_barrier(); - } - } - const int kend = L - 1; - for (int k=0;k - ::invoke(member, bk, xb); - } - - Trsv - ::invoke(member, 1.0, LT, xt); - - Gemv - ::invoke(member, -1.0, LB, xt, 1.0, xb); - } - { - LT.assign_data(&A(kend, 0, 0)); - xt.assign_data(&x(kend, 0)); - Trsv - ::invoke(member, 1.0, LT, xt); - } - } /// end forward substitution - - /// - /// backward substitution - /// - { - auto UT = Kokkos::subview(B, 0, Kokkos::ALL(), Kokkos::ALL()); - auto UB = Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); - - const int kbegin = L - 1; - for (int k=kbegin;k>0;--k) { - UT.assign_data(&B(k-1, 0, 0)); - UB.assign_data(&A(k, 0, 0)); - - xt.assign_data(&x(k-1, 0)); - xb.assign_data(&x(k, 0)); - - Trsv - ::invoke(member, 1.0, UB, xb); - - Gemv - ::invoke(member, -1.0, UT, xb, 1.0, xt); - } - { - UB.assign_data(&A(0, 0, 0)); - xb.assign_data(&x(0, 0)); - Trsv - ::invoke(member, 1.0, UB, xb); + policy_type policy(AA.extent(0), team_size, AA.extent(5)); + for (int iter = 0; iter < niter; ++iter) { + Kokkos::parallel_for( + "solve", policy.set_scratch_size(0, Kokkos::PerTeam(S)), + KOKKOS_LAMBDA(const member_type &member) { + typedef SolveModeAndAlgo + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + const int i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(AA, i, Kokkos::ALL(), 1, + Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(AA, i, Kokkos::ALL(), 2, + Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(AA, i, Kokkos::ALL(), 0, + Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0; jvec < Nvec; ++jvec) { + auto x = Kokkos::subview(xx, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + auto b = Kokkos::subview(bb, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + + auto xt = Kokkos::subview(x, 0, Kokkos::ALL()); + auto xb = Kokkos::subview(x, 0, Kokkos::ALL()); + + /// + /// forward substitution + /// + { + // const bool is_same_x_and_b = (x.data() == b.data()); + auto LT = + Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto LB = + Kokkos::subview(C, 0, Kokkos::ALL(), Kokkos::ALL()); + + auto bk = Kokkos::subview(b, 0, Kokkos::ALL()); + { + { // if (!is_same_x_and_b) { + Copy::invoke(member, bk, xb); + member.team_barrier(); + } + } + const int kend = L - 1; + for (int k = 0; k < kend; ++k) { + LT.assign_data(&A(k, 0, 0)); + LB.assign_data(&C(k, 0, 0)); + + xt.assign_data(&x(k, 0)); + xb.assign_data(&x(k + 1, 0)); + + { // if (!is_same_x_and_b) { + bk.assign_data(&b(k + 1, 0)); + Copy::invoke(member, bk, xb); + } + + 
Trsv::invoke(member, + 1.0, + LT, + xt); + + Gemv::invoke(member, -1.0, LB, xt, 1.0, + xb); + } + { + LT.assign_data(&A(kend, 0, 0)); + xt.assign_data(&x(kend, 0)); + Trsv::invoke(member, + 1.0, + LT, + xt); + } + } /// end forward substitution + + /// + /// backward substitution + /// + { + auto UT = + Kokkos::subview(B, 0, Kokkos::ALL(), Kokkos::ALL()); + auto UB = + Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + + const int kbegin = L - 1; + for (int k = kbegin; k > 0; --k) { + UT.assign_data(&B(k - 1, 0, 0)); + UB.assign_data(&A(k, 0, 0)); + + xt.assign_data(&x(k - 1, 0)); + xb.assign_data(&x(k, 0)); + + Trsv::invoke(member, 1.0, UB, xb); + + Gemv::invoke(member, -1.0, UT, xb, 1.0, + xt); + } + { + UB.assign_data(&A(0, 0, 0)); + xb.assign_data(&x(0, 0)); + Trsv::invoke(member, 1.0, UB, xb); + } + } // end backward substitution } - } // end backward substitution - } - }); - }); + }); + }); Kokkos::fence(); } const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("solve time = %f , # of solves per min = %f\n", t, 1.0/t*60*niter); + printf("solve time = %f , # of solves per min = %f\n", t, + 1.0 / t * 60 * niter); } - + /// /// compute residual /// @@ -495,105 +491,118 @@ int main(int argc, char* argv[]) { using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; policy_type policy(Acopy.extent(0), Kokkos::AUTO(), Acopy.extent(5)); - Kokkos::parallel_for - ("compute residual", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, Acopy.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec=0,jvecend=rs.extent(1);jvec - ::invoke(member, b0, r0); - TeamGemv - ::invoke(member, -1.0, A0, x0, 1.0, r0); - } else { - int k = 0; - { - /// first row - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k+1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, B2, x2, 1.0, rk); - ++k; - } - for (;k<(L-1);++k) { - auto C0 = Kokkos::subview(C, k-1, Kokkos::ALL(), Kokkos::ALL()); - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k-1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k+1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, C0, x0, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, B2, x2, 1.0, rk); + Kokkos::parallel_for( + "compute residual", policy, KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank(); + Kokkos::parallel_for( + 
Kokkos::ThreadVectorRange(member, Acopy.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, + Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, + Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, + Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; + ++jvec) { + auto x = Kokkos::subview(xs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + auto r = Kokkos::subview(rs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + + if (L == 1) { + auto A0 = + Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); + auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); + auto r0 = Kokkos::subview(r, 0, Kokkos::ALL()); + + TeamCopy::invoke(member, + b0, r0); + TeamGemv::invoke(member, -1.0, A0, x0, 1.0, + r0); + } else { + int k = 0; + { + /// first row + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = + Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke( + member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, + rk); + TeamGemv::invoke(member, -1.0, B2, x2, 1.0, + rk); + ++k; + } + for (; k < (L - 1); ++k) { + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), + Kokkos::ALL()); + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = + Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke( + member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, x0, 1.0, + rk); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, + rk); + TeamGemv::invoke(member, -1.0, B2, x2, 1.0, + rk); + } + { + // last row + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), + Kokkos::ALL()); + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke( + member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, -1.0, C0, x0, 1.0, + rk); + TeamGemv::invoke(member, -1.0, A1, x1, 1.0, + rk); + } + } } - { - // last row - auto C0 = Kokkos::subview(C, k-1, Kokkos::ALL(), Kokkos::ALL()); - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k-1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, C0, x0, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - } - } - } - }); - }); + }); + }); Kokkos::fence(); auto rs_host = Kokkos::create_mirror_view(rs); auto bs_host = Kokkos::create_mirror_view(bs); @@ -602,17 +611,19 @@ int main(int argc, char* argv[]) { 
Kokkos::fence(); { double norm2 = 0, diff2 = 0; - for (int i0=0,i0end=rs.extent(0);i0::value; +static constexpr int vector_length = + DefaultVectorLength::value; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -static constexpr int internal_vector_length = DefaultInternalVectorLength::value; +static constexpr int internal_vector_length = + DefaultInternalVectorLength::value; #else static constexpr int internal_vector_length = 1; #endif -typedef Vector,vector_length> vector_type; +typedef Vector, vector_length> vector_type; #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) -typedef Vector,internal_vector_length> internal_vector_type; +typedef Vector, internal_vector_length> internal_vector_type; #else typedef value_type internal_vector_type; #endif -template +template struct InverseDiagonalsModeAndAlgo; -template<> +template <> struct InverseDiagonalsModeAndAlgo { typedef Mode::Serial mode_type; - typedef Algo::Level3::Blocked algo_type; + typedef Algo::Level3::Blocked algo_type; }; #if defined(KOKKOS_ENABLE_CUDA) -template<> +template <> struct InverseDiagonalsModeAndAlgo { typedef Mode::Team mode_type; - typedef Algo::Level3::Unblocked algo_type; + typedef Algo::Level3::Unblocked algo_type; }; #endif -template +template struct SolveModeAndAlgo; -template<> +template <> struct SolveModeAndAlgo { typedef Mode::Serial mode_type; - typedef Algo::Level2::Blocked algo_type; + typedef Algo::Level2::Blocked algo_type; }; #if defined(KOKKOS_ENABLE_CUDA) -template<> +template <> struct SolveModeAndAlgo { typedef Mode::Team mode_type; - typedef Algo::Level2::Unblocked algo_type; + typedef Algo::Level2::Unblocked algo_type; }; #endif -int main(int argc, char* argv[]) { +int main(int argc, char *argv[]) { Kokkos::initialize(argc, argv); { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -115,21 +117,21 @@ int main(int argc, char* argv[]) { #endif Kokkos::print_configuration(std::cout); - //typedef Kokkos::Details::ArithTraits ats; + // typedef Kokkos::Details::ArithTraits ats; Kokkos::Timer timer; /// /// input arguments parsing /// - int N = 128*128; /// # of problems (batch size) - int L = 128; /// length of block tridiags - int Blk = 5; /// block dimension - int Nvec = 1; - int S = 0; /// scratch size - int niter = 1; - int nsweep = 10; - for (int i=1;i Av("A", - N/vector_length, L, 4, Blk, Blk); + Kokkos::View Av( + "A", N / vector_length, L, 4, Blk, Blk); /// double - Kokkos::View As((value_type*)Av.data(), - Av.extent(0), - Av.extent(1), - Av.extent(2), - Av.extent(3), - Av.extent(4), - vector_length); + Kokkos::View As( + (value_type *)Av.data(), Av.extent(0), Av.extent(1), Av.extent(2), + Av.extent(3), Av.extent(4), vector_length); /// double 2 - Kokkos::View Ai((internal_vector_type*)Av.data(), - Av.extent(0), - Av.extent(1), - Av.extent(2), - Av.extent(3), - Av.extent(4), - vector_length/internal_vector_length); + Kokkos::View + Ai((internal_vector_type *)Av.data(), Av.extent(0), Av.extent(1), + Av.extent(2), Av.extent(3), Av.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View xv("x", - N/vector_length, Nvec, 2, L, Blk); + Kokkos::View xv( + "x", N / vector_length, Nvec, 2, L, Blk); /// double - Kokkos::View xs((value_type*)xv.data(), - xv.extent(0), - xv.extent(1), - xv.extent(2), - xv.extent(3), - xv.extent(4), - vector_length); + Kokkos::View xs( + (value_type *)xv.data(), xv.extent(0), xv.extent(1), xv.extent(2), + xv.extent(3), xv.extent(4), vector_length); /// double 2 - Kokkos::View xi((internal_vector_type*)xv.data(), - 
xv.extent(0), - xv.extent(1), - xv.extent(2), - xv.extent(3), - xv.extent(4), - vector_length/internal_vector_length); + Kokkos::View + xi((internal_vector_type *)xv.data(), xv.extent(0), xv.extent(1), + xv.extent(2), xv.extent(3), xv.extent(4), + vector_length / internal_vector_length); /// double 16 - Kokkos::View bv("b", - N/vector_length, Nvec, L, Blk); + Kokkos::View bv( + "b", N / vector_length, Nvec, L, Blk); /// double - Kokkos::View bs((value_type*)bv.data(), - bv.extent(0), - bv.extent(1), - bv.extent(2), - bv.extent(3), - vector_length); + Kokkos::View bs( + (value_type *)bv.data(), bv.extent(0), bv.extent(1), bv.extent(2), + bv.extent(3), vector_length); /// double 2 - Kokkos::View bi((internal_vector_type*)bv.data(), - bv.extent(0), - bv.extent(1), - bv.extent(2), - bv.extent(3), - vector_length/internal_vector_length); - + Kokkos::View + bi((internal_vector_type *)bv.data(), bv.extent(0), bv.extent(1), + bv.extent(2), bv.extent(3), vector_length / internal_vector_length); /// double copy of A - Kokkos::View Acopy("Acopy", - As.extent(0), - As.extent(1), - As.extent(2), - As.extent(3), - As.extent(4), - As.extent(5)); - - Kokkos::View rs("rs", - bs.extent(0), - bs.extent(1), - bs.extent(2), - bs.extent(3), - bs.extent(4)); + Kokkos::View Acopy( + "Acopy", As.extent(0), As.extent(1), As.extent(2), As.extent(3), + As.extent(4), As.extent(5)); + + Kokkos::View rs( + "rs", bs.extent(0), bs.extent(1), bs.extent(2), bs.extent(3), + bs.extent(4)); #if defined(KOKKOSBATCHED_USE_128BIT_MEMORY_INST) auto AA = Ai; @@ -243,7 +218,7 @@ int main(int argc, char* argv[]) { Kokkos::fill_random(bs, random, value_type(1.0)); /// - /// diagonal dominant + /// diagonal dominant /// if (1) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) @@ -251,18 +226,21 @@ int main(int argc, char* argv[]) { #endif using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - policy_type policy(AA.extent(0)*L, Kokkos::AUTO(), AA.extent(5)); - Kokkos::parallel_for - ("diagonal dominant", - policy, KOKKOS_LAMBDA(const member_type &member) { - const int i = member.league_rank()/L; - const int k = member.league_rank()%L; - Kokkos::parallel_for(Kokkos::TeamThreadRange(member,Blk),[&](const int &j) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - AA(i, k, 1, j, j, v) += internal_vector_type(9*Blk); + policy_type policy(AA.extent(0) * L, Kokkos::AUTO(), AA.extent(5)); + Kokkos::parallel_for( + "diagonal dominant", policy, + KOKKOS_LAMBDA(const member_type &member) { + const int i = member.league_rank() / L; + const int k = member.league_rank() % L; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, Blk), [&](const int &j) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + AA(i, k, 1, j, j, v) += internal_vector_type(9 * Blk); + }); }); - }); - }); + }); Kokkos::fence(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); @@ -280,61 +258,70 @@ int main(int argc, char* argv[]) { #endif timer.reset(); typedef internal_vector_type scratch_value_type; - typedef Kokkos::View scratch_view_type; - + typedef Kokkos::View + scratch_view_type; + using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - const int per_team_scratch = scratch_view_type::shmem_size(Blk, Blk, AA.extent(5)); - int team_size = 0; - if (Blk < 8) { team_size = 32/AA.extent(5); - } else if (Blk < 12) { team_size = 32/AA.extent(5); - 
} else { team_size = 64/AA.extent(5); } - - policy_type policy(AA.extent(0)*L, team_size, AA.extent(5)); - Kokkos::parallel_for - ("inverse diagonals", - policy.set_scratch_size(0,Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), - KOKKOS_LAMBDA(const member_type &member) { - typedef InverseDiagonalsModeAndAlgo default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; - - const int i = member.league_rank()/L; - const int k = member.league_rank()%L; - - scratch_view_type WW(member.team_scratch(0), Blk, Blk, AA.extent(5)); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), Kokkos::ALL(), v); - auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v); - - Copy - ::invoke(member, A, W); - SetIdentity - ::invoke(member, D); - member.team_barrier(); - LU::invoke(member, W); - Trsm - ::invoke(member, 1.0, W, D); - Trsm - ::invoke(member, 1.0, W, D); - }); - }); + const int per_team_scratch = + scratch_view_type::shmem_size(Blk, Blk, AA.extent(5)); + int team_size = 0; + if (Blk < 8) { + team_size = 32 / AA.extent(5); + } else if (Blk < 12) { + team_size = 32 / AA.extent(5); + } else { + team_size = 64 / AA.extent(5); + } + + policy_type policy(AA.extent(0) * L, team_size, AA.extent(5)); + Kokkos::parallel_for( + "inverse diagonals", + policy.set_scratch_size( + 0, Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), + KOKKOS_LAMBDA(const member_type &member) { + typedef InverseDiagonalsModeAndAlgo< + Kokkos::Impl::ActiveExecutionMemorySpace> + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + const int i = member.league_rank() / L; + const int k = member.league_rank() % L; + + scratch_view_type WW(member.team_scratch(0), Blk, Blk, + AA.extent(5)); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), + Kokkos::ALL(), v); + auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), + Kokkos::ALL(), v); + auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v); + + Copy::invoke( + member, A, W); + SetIdentity::invoke(member, D); + member.team_barrier(); + LU::invoke(member, W); + Trsm::invoke(member, 1.0, W, + D); + Trsm::invoke(member, 1.0, + W, D); + }); + }); Kokkos::fence(); const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("inverse time = %f , # of inverse per min = %f \n", t, 1.0/t*60); + printf("inverse time = %f , # of inverse per min = %f \n", t, + 1.0 / t * 60); } /// @@ -346,75 +333,114 @@ int main(int argc, char* argv[]) { #endif timer.reset(); typedef internal_vector_type scratch_value_type; - typedef Kokkos::View scratch_view_type; - const int per_team_scratch = scratch_view_type::shmem_size(Blk, AA.extent(5)); - + typedef Kokkos::View + scratch_view_type; + const int per_team_scratch = + scratch_view_type::shmem_size(Blk, AA.extent(5)); + using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; - int team_size = 0; - if (Blk < 8) { team_size = 32/AA.extent(5); - } else if (Blk < 12) { team_size = 32/AA.extent(5); - } else { team_size = 32/AA.extent(5); } - policy_type 
policy(AA.extent(0)*L, team_size, AA.extent(5)); - - for (int iter=0;iter default_mode_and_algo_type; - typedef default_mode_and_algo_type::mode_type mode_type; - typedef default_mode_and_algo_type::algo_type algo_type; - - scratch_view_type WW(member.team_scratch(0), Blk, AA.extent(5)); - const int i = member.league_rank()/L; //%AA.extent(0); - const int k = member.league_rank()%L; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, AA.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(AA, i, k, 2, Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(AA, i, k ? k-1 : 0, 0, Kokkos::ALL(), Kokkos::ALL(), v); - auto u = Kokkos::subview(WW, Kokkos::ALL(), v); - for (int jvec=0;jvec::invoke(member, 1.0, D, b, 0.0, x1); - } else { - Copy::invoke(member, b, u); - if (k == 0) { - Gemv::invoke(member, -1.0, B, x2, 1.0, u); - } else if (k == L-1) { - Gemv::invoke(member, -1.0, C, x0, 1.0, u); - } else { - Gemv::invoke(member, -1.0, B, x2, 1.0, u); - Gemv::invoke(member, -1.0, C, x0, 1.0, u); - } - Gemv::invoke(member, 1.0, D, u, 0.0, y1); - } - } - }); - }); - auto tmp = xxx; xxx = yyy; yyy = tmp; - } - Kokkos::fence(); + int team_size = 0; + if (Blk < 8) { + team_size = 32 / AA.extent(5); + } else if (Blk < 12) { + team_size = 32 / AA.extent(5); + } else { + team_size = 32 / AA.extent(5); + } + policy_type policy(AA.extent(0) * L, team_size, AA.extent(5)); + + for (int iter = 0; iter < niter; ++iter) { + auto xxx = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 0, + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto yyy = Kokkos::subview(xx, Kokkos::ALL(), Kokkos::ALL(), 1, + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + + for (int nis = 0; nis < nsweep; ++nis) { + Kokkos::parallel_for( + "solve", + policy.set_scratch_size( + 0, + Kokkos::PerTeam(S < per_team_scratch ? per_team_scratch : S)), + KOKKOS_LAMBDA(const member_type &member) { + typedef SolveModeAndAlgo< + Kokkos::Impl::ActiveExecutionMemorySpace> + default_mode_and_algo_type; + typedef default_mode_and_algo_type::mode_type mode_type; + typedef default_mode_and_algo_type::algo_type algo_type; + + scratch_view_type WW(member.team_scratch(0), Blk, AA.extent(5)); + const int i = member.league_rank() / L; //%AA.extent(0); + const int k = member.league_rank() % L; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, AA.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(AA, i, k, 1, Kokkos::ALL(), + Kokkos::ALL(), v); + auto D = Kokkos::subview(AA, i, k, 3, Kokkos::ALL(), + Kokkos::ALL(), v); + auto B = Kokkos::subview(AA, i, k, 2, Kokkos::ALL(), + Kokkos::ALL(), v); + auto C = Kokkos::subview(AA, i, k ? k - 1 : 0, 0, + Kokkos::ALL(), Kokkos::ALL(), v); + auto u = Kokkos::subview(WW, Kokkos::ALL(), v); + for (int jvec = 0; jvec < Nvec; ++jvec) { + auto x0 = Kokkos::subview( + xxx, i, jvec, k == 0 ? 0 : k - 1, Kokkos::ALL(), v); + auto x1 = + Kokkos::subview(xxx, i, jvec, k, Kokkos::ALL(), v); + auto x2 = Kokkos::subview(xxx, i, jvec, + k == L - 1 ? 
0 : k + 1, + Kokkos::ALL(), v); + auto y1 = + Kokkos::subview(yyy, i, jvec, k, Kokkos::ALL(), v); + auto b = + Kokkos::subview(bb, i, jvec, k, Kokkos::ALL(), v); + + if (L == 1) { + Gemv::invoke(member, 1.0, D, b, 0.0, x1); + } else { + Copy::invoke(member, b, u); + if (k == 0) { + Gemv::invoke(member, -1.0, B, x2, 1.0, + u); + } else if (k == L - 1) { + Gemv::invoke(member, -1.0, C, x0, 1.0, + u); + } else { + Gemv::invoke(member, -1.0, B, x2, 1.0, + u); + Gemv::invoke(member, -1.0, C, x0, 1.0, + u); + } + Gemv::invoke(member, 1.0, D, u, 0.0, y1); + } + } + }); + }); + auto tmp = xxx; + xxx = yyy; + yyy = tmp; + } + Kokkos::fence(); } const double t = timer.seconds(); #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSBATCHED_PROFILE) cudaProfilerStop(); #endif - printf("solve time = %f , # of solves per min = %f\n", t, 1.0/t*60*niter); + printf("solve time = %f , # of solves per min = %f\n", t, + 1.0 / t * 60 * niter); } - + /// /// compute residual /// @@ -422,105 +448,142 @@ int main(int argc, char* argv[]) { typedef KokkosBatched::Algo::Level2::Unblocked algo_type; using policy_type = Kokkos::TeamPolicy; policy_type policy(Acopy.extent(0), Kokkos::AUTO(), Acopy.extent(5)); - Kokkos::parallel_for - ("compute residual", - policy, KOKKOS_LAMBDA(const typename policy_type::member_type &member) { - const int i = member.league_rank(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, Acopy.extent(5)),[&](const int &v) { - auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, Kokkos::ALL(), Kokkos::ALL(), v); - auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, Kokkos::ALL(), Kokkos::ALL(), v); - auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL(), v); - - for (int jvec=0,jvecend=rs.extent(1);jvec - ::invoke(member, b0, r0); - TeamGemv - ::invoke(member, -1.0, A0, x0, 1.0, r0); - } else { - int k = 0; - { - /// first row - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k+1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, B2, x2, 1.0, rk); - ++k; + Kokkos::parallel_for( + "compute residual", policy, + KOKKOS_LAMBDA(const typename policy_type::member_type &member) { + const int i = member.league_rank(); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, Acopy.extent(5)), + [&](const int &v) { + auto A = Kokkos::subview(Acopy, i, Kokkos::ALL(), 1, + Kokkos::ALL(), Kokkos::ALL(), v); + auto B = Kokkos::subview(Acopy, i, Kokkos::ALL(), 2, + Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(Acopy, i, Kokkos::ALL(), 0, + Kokkos::ALL(), Kokkos::ALL(), v); + + for (int jvec = 0, jvecend = rs.extent(1); jvec < jvecend; + ++jvec) { + auto x = Kokkos::subview(xs, i, jvec, nsweep % 2, + Kokkos::ALL(), Kokkos::ALL(), v); + auto b = Kokkos::subview(bs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + auto r = Kokkos::subview(rs, i, jvec, Kokkos::ALL(), + Kokkos::ALL(), v); + + if (L == 1) { + auto A0 = + Kokkos::subview(A, 0, Kokkos::ALL(), Kokkos::ALL()); + auto x0 = Kokkos::subview(x, 0, Kokkos::ALL()); + auto b0 = Kokkos::subview(b, 0, Kokkos::ALL()); + auto r0 = Kokkos::subview(r, 0, Kokkos::ALL()); + + TeamCopy::invoke(member, b0, r0); + TeamGemv::invoke(member, + 
-1.0, A0, + x0, 1.0, + r0); + } else { + int k = 0; + { + /// first row + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = + Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, + -1.0, + A1, x1, + 1.0, + rk); + TeamGemv::invoke(member, + -1.0, + B2, x2, + 1.0, + rk); + ++k; + } + for (; k < (L - 1); ++k) { + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), + Kokkos::ALL()); + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + auto B2 = + Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + auto x2 = Kokkos::subview(x, k + 1, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, + -1.0, + C0, x0, + 1.0, + rk); + TeamGemv::invoke(member, + -1.0, + A1, x1, + 1.0, + rk); + TeamGemv::invoke(member, + -1.0, + B2, x2, + 1.0, + rk); + } + { + // last row + auto C0 = Kokkos::subview(C, k - 1, Kokkos::ALL(), + Kokkos::ALL()); + auto A1 = + Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); + + auto x0 = Kokkos::subview(x, k - 1, Kokkos::ALL()); + auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); + + auto bk = Kokkos::subview(b, k, Kokkos::ALL()); + auto rk = Kokkos::subview(r, k, Kokkos::ALL()); + TeamCopy::invoke(member, bk, rk); + member.team_barrier(); + TeamGemv::invoke(member, + -1.0, + C0, x0, + 1.0, + rk); + TeamGemv::invoke(member, + -1.0, + A1, x1, + 1.0, + rk); + } + } } - for (;k<(L-1);++k) { - auto C0 = Kokkos::subview(C, k-1, Kokkos::ALL(), Kokkos::ALL()); - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - auto B2 = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k-1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - auto x2 = Kokkos::subview(x, k+1, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, C0, x0, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, B2, x2, 1.0, rk); - } - { - // last row - auto C0 = Kokkos::subview(C, k-1, Kokkos::ALL(), Kokkos::ALL()); - auto A1 = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL()); - - auto x0 = Kokkos::subview(x, k-1, Kokkos::ALL()); - auto x1 = Kokkos::subview(x, k, Kokkos::ALL()); - - auto bk = Kokkos::subview(b, k, Kokkos::ALL()); - auto rk = Kokkos::subview(r, k, Kokkos::ALL()); - TeamCopy - ::invoke(member, bk, rk); - member.team_barrier(); - TeamGemv - ::invoke(member, -1.0, C0, x0, 1.0, rk); - TeamGemv - ::invoke(member, -1.0, A1, x1, 1.0, rk); - } - } - } - }); - }); + }); + }); Kokkos::fence(); auto rs_host = Kokkos::create_mirror_view(rs); auto bs_host = Kokkos::create_mirror_view(bs); @@ -529,17 +592,19 @@ int main(int argc, char* argv[]) { Kokkos::fence(); { double norm2 = 0, diff2 = 0; - for (int i0=0,i0end=rs.extent(0);i0 @@ -24,622 +24,638 @@ #include "KokkosBatched_Gemm_Team_Impl.hpp" namespace KokkosBatched { - namespace PerfTest { +namespace 
PerfTest { #undef FLOP_MUL #undef FLOP_ADD #define FLOP_MUL 1.0 #define FLOP_ADD 1.0 - typedef double value_type; +typedef double value_type; - double FlopCount(int mm, int nn, int kk) { - double m = (double)mm; double n = (double)nn; double k = (double)kk; - return (FLOP_MUL*(m*n*k) + - FLOP_ADD*(m*n*k)); - } +double FlopCount(int mm, int nn, int kk) { + double m = (double)mm; + double n = (double)nn; + double k = (double)kk; + return (FLOP_MUL * (m * n * k) + FLOP_ADD * (m * n * k)); +} - struct RangeTag {}; - struct TeamTagV1 {}; - struct TeamTagV2 {}; - struct TeamTagV3 {}; - struct TeamTagHandmade {}; - - template - struct Functor { - ConstUnmanagedViewType _a, _b; - UnmanagedViewType _c; - - KOKKOS_INLINE_FUNCTION - Functor() = default; - - KOKKOS_INLINE_FUNCTION - Functor(const ViewType &a, - const ViewType &b, - const ViewType &c) - : _a(a), _b(b), _c(c) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const RangeTag &, const int k) const { - auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - - SerialGemm:: - invoke(1.0, aa, bb, 1.0, cc); - } +struct RangeTag {}; +struct TeamTagV1 {}; +struct TeamTagV2 {}; +struct TeamTagV3 {}; +struct TeamTagHandmade {}; - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV1 &, const MemberType &member) const { - const int kbeg = (member.league_rank()*(member.team_size()*VectorLength) + - member.team_rank()*VectorLength); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - SerialGemm:: - invoke(1.0, aa, bb, 1.0, cc); - } - }); - } - - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV2 &, const MemberType &member) const { - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - TeamGemm:: - invoke(member, 1.0, aa, bb, 1.0, cc); - } - }); - } +template +struct Functor { + ConstUnmanagedViewType _a, _b; + UnmanagedViewType _c; - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV3 &, const MemberType &member) const { - const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); - ScratchViewType sb(member.team_scratch(lvl), VectorLength, _b.extent(1), _b.extent(2)); - - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); - - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - auto sbb = Kokkos::subview(sb, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - 
TeamCopy::invoke(member, bb, sbb); - member.team_barrier(); - - TeamGemm:: - invoke(member, 1.0, saa, sbb, 1.0, cc); - } - }); - } - - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagHandmade &, const MemberType &member) const { - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_c.extent(0))) { - const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member,0,m*n), - [&](const int &ij) { - const int i = ij%m, j = ij/m; - typename ViewType::non_const_value_type cval = 0; - for (int p=0;p - void Gemm(const int NN, const int BlkSize) { - typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - - std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; - } + KOKKOS_INLINE_FUNCTION + Functor() = default; - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize,BlkSize); - const double tmax = 1.0e15; + KOKKOS_INLINE_FUNCTION + Functor(const ViewType &a, const ViewType &b, const ViewType &c) + : _a(a), _b(b), _c(c) {} - typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; - typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; + KOKKOS_INLINE_FUNCTION + void operator()(const RangeTag &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), Kokkos::ALL()); - const int iter_begin = -3, iter_end = 30; - Kokkos::Timer timer; + SerialGemm::invoke( + 1.0, aa, bb, 1.0, cc); + } - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - bmat("bmat", N*VectorLength, BlkSize, BlkSize), - cref("cref", N*VectorLength, BlkSize, BlkSize); + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, + const MemberType &member) const { + const int kbeg = + (member.league_rank() * (member.team_size() * VectorLength) + + member.team_rank() * VectorLength); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + SerialGemm::invoke(1.0, aa, bb, 1.0, cc); + } + }); + } - { - Random random; - for (int k=0;k + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, + const MemberType &member) const { + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + TeamGemm::invoke(member, 1.0, aa, bb, 1.0, cc); + } + }); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, + const MemberType &member) const { + const int lvl = 0; + ScratchViewType 
sa(member.team_scratch(lvl), VectorLength, + _a.extent(1), _a.extent(2)); + ScratchViewType sb(member.team_scratch(lvl), VectorLength, + _b.extent(1), _b.extent(2)); + + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(_c, kk, Kokkos::ALL(), Kokkos::ALL()); + + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + auto sbb = Kokkos::subview(sb, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + TeamCopy::invoke(member, bb, sbb); + member.team_barrier(); + + TeamGemm::invoke(member, 1.0, saa, sbb, 1.0, cc); + } + }); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagHandmade &, + const MemberType &member) const { + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_c.extent(0))) { + const int m = _c.extent(1), n = _c.extent(2), q = _a.extent(2); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, m * n), [&](const int &ij) { + const int i = ij % m, j = ij / m; + typename ViewType::non_const_value_type cval = 0; + for (int p = 0; p < q; ++p) + cval += _a(kk, i, p) * _b(kk, p, j); + _c(kk, i, j) += cval; + }); + } + }); + } +}; + +template +void Gemm(const int NN, const int BlkSize) { + typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; + + std::cout << "SIMD is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; + } - // P100 L2 cache 4MB per core - constexpr size_t LLC_CAPACITY = 56*4*1024*1024; - Flush flush; + const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize, BlkSize); + const double tmax = 1.0e15; + + typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; + typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; + + const int iter_begin = -3, iter_end = 30; + Kokkos::Timer timer; + + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, BlkSize), + cref("cref", N * VectorLength, BlkSize, BlkSize); + + { + Random random; + for (int k = 0; k < N * VectorLength; ++k) + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) { + amat(k, i, j) = random.value(); + bmat(k, i, j) = random.value(); + } + } + + // P100 L2 cache 4MB per core + constexpr size_t LLC_CAPACITY = 56 * 4 * 1024 * 1024; + Flush flush; #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - if (1) { - /// - /// CUBLAS Strided version - /// - const Kokkos::LayoutStride stride(N*VectorLength, BlkSize*BlkSize, - BlkSize, 1, - BlkSize, BlkSize); + if (1) { + /// + /// CUBLAS Strided version + /// + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, + BlkSize, 1, BlkSize, BlkSize); + + Kokkos::View a( + "a", stride), + b("b", stride), c("c", stride); + + double tavg = 0, tmin = tmax; - Kokkos::View - a("a", stride), - b("b", stride), - c("c", stride); + cublasStatus_t stat; + cublasHandle_t handle; - double tavg = 0, 
tmin = tmax; + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + Kokkos::abort("CUBLAS initialization failed\n"); - cublasStatus_t stat; - cublasHandle_t handle; + auto amat_device = + Kokkos::create_mirror_view(DeviceMemorySpaceType(), amat); + auto bmat_device = + Kokkos::create_mirror_view(DeviceMemorySpaceType(), bmat); - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + Kokkos::deep_copy(amat_device, amat); + Kokkos::deep_copy(bmat_device, bmat); - auto amat_device = Kokkos::create_mirror_view(DeviceMemorySpaceType(), amat); - auto bmat_device = Kokkos::create_mirror_view(DeviceMemorySpaceType(), bmat); + Kokkos::fence(); - Kokkos::deep_copy(amat_device, amat); - Kokkos::deep_copy(bmat_device, bmat); + const double one(1.0), zero(0.0); + { + tavg = 0; + tmin = tmax; + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat_device); + Kokkos::deep_copy(b, bmat_device); + Kokkos::deep_copy(c, 0); Kokkos::fence(); + timer.reset(); - const double one(1.0), zero(0.0); - { - tavg = 0; tmin = tmax; - - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - Kokkos::deep_copy(cref, csol); - - std::cout << std::setw(8) << "CUBLAS" - << std::setw(8) << "Strided" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = N/A" - << " ScratchSize (KB) = 0" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; - } - cublasDestroy(handle); + stat = cublasDgemmStridedBatched( + handle, CUBLAS_OP_N, CUBLAS_OP_N, BlkSize, BlkSize, BlkSize, &one, + (const value_type *)a.data(), BlkSize, BlkSize * BlkSize, + (const value_type *)b.data(), BlkSize, BlkSize * BlkSize, &zero, + (value_type *)c.data(), BlkSize, BlkSize * BlkSize, + N * VectorLength); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + Kokkos::deep_copy(cref, csol); + + std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Strided" + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + } + cublasDestroy(handle); + } #endif - if (1) { - /// - /// Range policy version - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Functor functor_type; - const Kokkos::RangePolicy policy(0, N*VectorLength); - - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i functor_type; + const Kokkos::RangePolicy policy( + 0, N * VectorLength); + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + 
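For context, every variant timed in this file shares the same measurement protocol: the iteration counter starts at a negative value so the first few runs act as warm-up, the minimum time gives the best-case rate, and only non-negative iterations feed the average through the `(iter >= 0) * t` trick. A condensed, self-contained sketch of that protocol follows; the `do_work` callable, the label, and the iteration counts are placeholders for illustration and are not part of the patch.

#include <algorithm>
#include <iostream>
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>

// Condensed sketch of the warm-up / min / average timing scheme used by every
// variant in these perf tests. "Work" is any callable launching the kernel.
template <typename Work>
void time_variant(const char *label, const double flop, Work do_work) {
  const int iter_begin = -3, iter_end = 30;  // negative iterations are warm-up only
  Kokkos::Timer timer;
  double tavg = 0, tmin = 1.0e15;
  for (int iter = iter_begin; iter < iter_end; ++iter) {
    Kokkos::fence();
    timer.reset();
    do_work();                    // the kernel being measured
    Kokkos::fence();
    const double t = timer.seconds();
    tmin = std::min(tmin, t);
    tavg += (iter >= 0) * t;      // warm-up runs do not count toward the average
  }
  tavg /= iter_end;               // iter_end equals the number of measured runs
  std::cout << label << " avg flop/s = " << (flop / tavg)
            << " max flop/s = " << (flop / tmin) << std::endl;
}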
timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::RangeTag", + policy, functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - + csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Range" + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V1 - almost same scheduling with range policy; - /// expect the same performance as range policy - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - - typedef Functor functor_type; - - // 128 is rough estimates - const int team_size = - policy_type(N/32, Kokkos::AUTO, VectorLength).team_size_recommended(functor_type(), Kokkos::ParallelForTag()); - - const policy_type policy(N/team_size, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i + policy_type; + + typedef Functor functor_type; + + // 128 is rough estimates + const int team_size = + policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + + const policy_type policy(N / team_size, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV1", + policy, functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - + csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V1" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << 
" diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V2 - team parallel - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::Gemm::Blocked::mb(), - mp = BlkSize%mb > 0; - - const int - mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(mblk*mblk,4), max_team_size); - - policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::Gemm::Blocked::mb(), + mp = BlkSize % mb > 0; + + const int mblk = is_blocked_algo ? (BlkSize / mb + mp) : BlkSize; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(mblk * mblk, 4), max_team_size); + + policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::GemmCuda::TeamPolicyV2", + policy, functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - + csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V3 - team parallel + scratch - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int lvl = 0, per_team_scratch = 2*ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); - //std::cout << "per team scratch " << 
per_team_scratch << "\n"; - if (per_team_scratch/1024 < 48) { - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::Gemm::Blocked::mb(), - mp = BlkSize%mb > 0; - - const int - mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(mblk*mblk,4), max_team_size); - - policy_type policy = policy_type(N, team_size, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int lvl = 0, + per_team_scratch = 2 * ScratchViewType::shmem_size( + VectorLength, BlkSize, BlkSize); + // std::cout << "per team scratch " << per_team_scratch << "\n"; + if (per_team_scratch / 1024 < 48) { + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::Gemm::Blocked::mb(), + mp = BlkSize % mb > 0; + + const int mblk = is_blocked_algo ? (BlkSize / mb + mp) : BlkSize; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(mblk * mblk, 4), max_team_size); + + policy_type policy = + policy_type(N, team_size, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmCuda::TeamPolicyV3", policy, + functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = Kokkos::create_mirror_view( + typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - + csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = " << std::setw(3) + << (per_team_scratch / 1024) << " time = " << std::scientific + << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } else { - std::cout << std::setw(8) << "Kokkos" - << std::setw(8) << "Team V3" - << " Scratch per team is too big:" << std::setw(3) << (per_team_scratch/1024) - << std::endl; - } - } + std::cout << std::endl; + } else { + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" + << " Scratch per team is too big:" << std::setw(3) + << (per_team_scratch / 1024) << std::endl; } + } + } - if (1) { - /// - /// Team policy - handmade - /// - 
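The `per_team_scratch / 1024 < 48` guard in the Team V3 branch reflects the usual 48 KB per-block shared-memory limit on NVIDIA GPUs; requesting more level-0 scratch than that would make the team launch fail, so the variant is simply skipped and reported as "Scratch per team is too big". A minimal sketch of the scratch-sizing pattern itself is given below, using placeholder aliases (`scratch_view`, `make_policy`) rather than the ScratchViewType helper the patch uses.

#include <Kokkos_Core.hpp>

// Illustrative only: request level-0 (shared-memory) scratch for two
// VectorLength x Blk x Blk staging buffers per team, as the Team V3 variant does.
using exec_space = Kokkos::DefaultExecutionSpace;
using scratch_view =
    Kokkos::View<double ***, exec_space::scratch_memory_space,
                 Kokkos::MemoryTraits<Kokkos::Unmanaged>>;

Kokkos::TeamPolicy<exec_space> make_policy(int league, int team, int vlen, int blk) {
  // bytes of team scratch needed for the two staging buffers
  const int per_team_scratch = 2 * scratch_view::shmem_size(vlen, blk, blk);
  return Kokkos::TeamPolicy<exec_space>(league, team, vlen)
      .set_scratch_size(0, Kokkos::PerTeam(per_team_scratch));
}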
typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); - - const int team_size = std::min(max_team_size,BlkSize*BlkSize); - - const policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto csol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); - Kokkos::deep_copy(csol, c); - - double diff = 0; - for (int i=0,iend=cref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + + const int team_size = std::min(max_team_size, BlkSize * BlkSize); + + const policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmCuda::TeamPolicyHandmade", policy, + functor_type(a, b, c)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto csol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), c); + Kokkos::deep_copy(csol, c); + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(cref(i, j, k) - + csol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team HM" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } - std::cout << std::endl; } } -} + std::cout << std::endl; +} +} // namespace PerfTest +} // namespace KokkosBatched using namespace KokkosBatched; -template +template void run(const int N, const int B) { typedef Kokkos::DefaultExecutionSpace ExecSpace; @@ -648,27 +664,25 @@ void run(const int N, const int B) { if (B != 0) { PerfTest::Gemm(N, B); } else { - PerfTest::Gemm(N, 3); - PerfTest::Gemm(N, 5); + PerfTest::Gemm(N, 3); + PerfTest::Gemm(N, 5); PerfTest::Gemm(N, 10); PerfTest::Gemm(N, 15); - + // PerfTest::Gemm(N, 4); // PerfTest::Gemm(N, 8); // PerfTest::Gemm(N, 16); // PerfTest::Gemm(N, 18); } - } int main(int argc, char *argv[]) { - Kokkos::initialize(argc, argv); - int N = 128*128, B = 0; + int N = 128 * 128, B = 0; - for (int i=1;i(N, B); - - std::cout << "\n Testing LayoutLeft Algo::Gemm::Blocked\n"; + + std::cout << "\n Testing LayoutLeft Algo::Gemm::Blocked\n"; run(N, B); } diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host.hpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host.hpp index 4e827f34b6..de67d9c804 100644 --- 
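Before the host-side file below, it is worth spelling out the core operation that every policy variant above ultimately dispatches to: an independent small dense update C := alpha*A*B + beta*C for each matrix in the batch, invoked through the KokkosBatched serial interface. The following is a minimal host example assuming the usual KokkosBatched headers; the matrix sizes and fill values are arbitrary and chosen only for illustration.

#include <Kokkos_Core.hpp>
#include "KokkosBatched_Util.hpp"
#include "KokkosBatched_Gemm_Decl.hpp"
#include "KokkosBatched_Gemm_Serial_Impl.hpp"

using namespace KokkosBatched;

int main(int argc, char *argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int nbatch = 4, blk = 5;
    Kokkos::View<double ***, Kokkos::HostSpace> A("A", nbatch, blk, blk),
        B("B", nbatch, blk, blk), C("C", nbatch, blk, blk);
    Kokkos::deep_copy(A, 1.0);
    Kokkos::deep_copy(B, 2.0);
    for (int k = 0; k < nbatch; ++k) {
      auto a = Kokkos::subview(A, k, Kokkos::ALL(), Kokkos::ALL());
      auto b = Kokkos::subview(B, k, Kokkos::ALL(), Kokkos::ALL());
      auto c = Kokkos::subview(C, k, Kokkos::ALL(), Kokkos::ALL());
      // C := 1.0*A*B + 1.0*C for one small matrix of the batch
      SerialGemm<Trans::NoTranspose, Trans::NoTranspose,
                 Algo::Gemm::Unblocked>::invoke(1.0, a, b, 1.0, c);
    }
  }
  Kokkos::finalize();
  return 0;
}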
a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host.hpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host.hpp @@ -23,533 +23,512 @@ //#undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { - namespace PerfTest { +namespace PerfTest { #undef FLOP_MUL #undef FLOP_ADD -#if defined( KokkosBatched_Test_Gemm_Host_Complex ) +#if defined(KokkosBatched_Test_Gemm_Host_Complex) #define FLOP_MUL 6.0 #define FLOP_ADD 2.0 - typedef Kokkos::complex value_type; +typedef Kokkos::complex value_type; #endif -#if defined( KokkosBatched_Test_Gemm_Host_Real ) +#if defined(KokkosBatched_Test_Gemm_Host_Real) #define FLOP_MUL 1.0 -#define FLOP_ADD 1.0 - typedef double value_type; +#define FLOP_ADD 1.0 +typedef double value_type; #endif - double FlopCount(int mm, int nn, int kk) { - double m = (double)mm; double n = (double)nn; double k = (double)kk; - return (FLOP_MUL*(m*n*k) + - FLOP_ADD*(m*n*k)); - } - - template - void Gemm(const int NN) { - typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; -#if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; +double FlopCount(int mm, int nn, int kk) { + double m = (double)mm; + double n = (double)nn; + double k = (double)kk; + return (FLOP_MUL * (m * n * k) + FLOP_ADD * (m * n * k)); +} + +template +void Gemm(const int NN) { + typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; +#if defined(__AVX512F__) + std::cout << "AVX512 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " + << value_type_name << " a vector length " << VectorLength << "\n"; #endif - } - - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize,BlkSize); - const double tmax = 1.0e15; - - const int iter_begin = -10, iter_end = 100; - Kokkos::Timer timer; - - Kokkos::View cref; - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - bmat("bmat", N*VectorLength, BlkSize, BlkSize); - - Kokkos::Random_XorShift64_Pool random(13718); - Kokkos::fill_random(amat, random, value_type(1.0)); - Kokkos::fill_random(bmat, random, value_type(1.0)); - - typedef Vector,VectorLength> VectorType; - Kokkos::View - amat_simd("amat_simd", N, BlkSize, BlkSize), - bmat_simd("bmat_simd", N, BlkSize, BlkSize); - - Kokkos::parallel_for("KokkosBatched::PerfTest::GemmHost::Pack", - Kokkos::RangePolicy(0, N*VectorLength), - KOKKOS_LAMBDA(const int k) { - const int k0 = k/VectorLength, k1 = k%VectorLength; - for (int i=0;i flush; - - /// - /// Reference version using MKL DGEMM - /// -#if 
defined(__KOKKOSBATCHED_INTEL_MKL__) - { - Kokkos::View - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - { - const Kokkos::RangePolicy policy(0, N*VectorLength); - - double tavg = 0, tmin = tmax; - for (int iter=iter_begin;iter::value) { - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - BlkSize, BlkSize, BlkSize, - one, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0(), - one, - (double*)cc.data(), cc.stride_0()); - } else if (std::is_same >::value) { - cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - BlkSize, BlkSize, BlkSize, - (void*)&one, - (void*)aa.data(), aa.stride_0(), - (void*)bb.data(), bb.stride_0(), - (void*)&one, - (void*)cc.data(), cc.stride_0()); - } - - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; + } + + const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize, BlkSize); + const double tmax = 1.0e15; + + const int iter_begin = -10, iter_end = 100; + Kokkos::Timer timer; + + Kokkos::View cref; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, BlkSize); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(amat, random, value_type(1.0)); + Kokkos::fill_random(bmat, random, value_type(1.0)); + + typedef Vector, VectorLength> VectorType; + Kokkos::View amat_simd( + "amat_simd", N, BlkSize, BlkSize), + bmat_simd("bmat_simd", N, BlkSize, BlkSize); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmHost::Pack", + Kokkos::RangePolicy(0, N * VectorLength), + KOKKOS_LAMBDA(const int k) { + const int k0 = k / VectorLength, k1 = k % VectorLength; + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) { + amat_simd(k0, i, j)[k1] = amat(k, i, j); + bmat_simd(k0, i, j)[k1] = bmat(k, i, j); } - tavg /= iter_end; + }); - std::cout << std::setw(12) << "MKL DGEMM" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; + // for KNL (1MB per tile) + constexpr size_t LLC_CAPACITY = 34 * 1024 * 1024; + Flush flush; - cref = c; - } + /// + /// Reference version using MKL DGEMM + /// +#if defined(__KOKKOSBATCHED_INTEL_MKL__) + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, BlkSize), + c("c", N * VectorLength, BlkSize, BlkSize); + + { + const Kokkos::RangePolicy policy( + 0, N * VectorLength); + + double tavg = 0, tmin = tmax; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmHost::CblasOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); + + const double one = 1.0; + if (std::is_same::value) { + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, BlkSize, + BlkSize, BlkSize, one, (double *)aa.data(), + aa.stride_0(), (double *)bb.data(), bb.stride_0(), + one, (double *)cc.data(), cc.stride_0()); + } else if (std::is_same >::value) { + cblas_zgemm(CblasRowMajor, CblasNoTrans, 
CblasNoTrans, BlkSize, + BlkSize, BlkSize, (void *)&one, (void *)aa.data(), + aa.stride_0(), (void *)bb.data(), bb.stride_0(), + (void *)&one, (void *)cc.data(), cc.stride_0()); + } + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + std::cout << std::setw(12) << "MKL DGEMM" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + + cref = c; + } + } #if defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) - { - typedef Kokkos::View ViewType; - ViewType - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, BlkSize), - c("c", N*VectorLength, BlkSize, BlkSize); - - value_type - *aa[N*VectorLength], - *bb[N*VectorLength], - *cc[N*VectorLength]; - - for (int k=0;k + ViewType; + ViewType a("a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, BlkSize), + c("c", N * VectorLength, BlkSize, BlkSize); + + value_type *aa[N * VectorLength], *bb[N * VectorLength], + *cc[N * VectorLength]; + + for (int k = 0; k < N * VectorLength; ++k) { + aa[k] = &a(k, 0, 0); + bb[k] = &b(k, 0, 0); + cc[k] = &c(k, 0, 0); + } - { - double tavg = 0, tmin = tmax; - - MKL_INT blksize[1] = { BlkSize }; - MKL_INT lda[1] = { a.stride_1() }; - MKL_INT ldb[1] = { b.stride_1() }; - MKL_INT ldc[1] = { c.stride_1() }; - - CBLAS_TRANSPOSE transA[1] = { CblasNoTrans }; - CBLAS_TRANSPOSE transB[1] = { CblasNoTrans }; - - double one[1] = { 1.0 }; - MKL_INT size_per_grp[1] = { N*VectorLength }; - - for (int iter=iter_begin;iter::value) { - cblas_dgemm_batch(CblasRowMajor, - transA, - transB, - blksize, blksize, blksize, - one, - (const double**)aa, lda, - (const double**)bb, ldb, - one, - (double**)cc, ldc, - 1, size_per_grp); - } else if (std::is_same >::value) { - cblas_zgemm_batch(CblasRowMajor, - transA, - transB, - blksize, blksize, blksize, - one, - (const void**)aa, lda, - (const void**)bb, ldb, - one, - (void**)cc, ldc, - 1, size_per_grp); - } - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=cref.extent(0);i zone(1.0); - - MKL_COMPACT_PACK format; - if (std::is_same::value) { - if (VectorLength == 4) format = MKL_COMPACT_AVX; - else if (VectorLength == 8) format = MKL_COMPACT_AVX512; - } else if (std::is_same >::value) { - if (VectorLength == 2) format = MKL_COMPACT_AVX; - else if (VectorLength == 4) format = MKL_COMPACT_AVX512; - } + { + Kokkos::View a( + "a", N, BlkSize, BlkSize), + b("b", N, BlkSize, BlkSize), c("c", N, BlkSize, BlkSize); + + { + double tavg = 0, tmin = tmax; + + double done(1.0); + std::complex zone(1.0); + + MKL_COMPACT_PACK format; + if (std::is_same::value) { + if (VectorLength == 4) + format = MKL_COMPACT_AVX; + else if (VectorLength == 8) + format = MKL_COMPACT_AVX512; + } else if (std::is_same >::value) { + if (VectorLength == 2) + format = MKL_COMPACT_AVX; + else if (VectorLength == 4) + format = MKL_COMPACT_AVX512; + } - if (format == MKL_COMPACT_AVX512 || format == MKL_COMPACT_AVX) { - for (int iter=iter_begin;iter::value) { - mkl_dgemm_compact(MKL_ROW_MAJOR, - MKL_NOTRANS, MKL_NOTRANS, - BlkSize, BlkSize, BlkSize, - done, - (const double*)a.data(), (MKL_INT)a.stride_1(), - (const double*)b.data(), (MKL_INT)b.stride_1(), - done, - ( double*)c.data(), (MKL_INT)c.stride_1(), - format, N*VectorLength); - } 
else if (std::is_same >::value) { - mkl_zgemm_compact(MKL_ROW_MAJOR, - MKL_NOTRANS, MKL_NOTRANS, - BlkSize, BlkSize, BlkSize, - (MKL_Complex16*)&zone, - (const double*)a.data(), (MKL_INT)a.stride_1(), - (const double*)b.data(), (MKL_INT)b.stride_1(), - (MKL_Complex16*)&zone, - ( double*)c.data(), (MKL_INT)c.stride_1(), - format, N*VectorLength); - } - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=cref.extent(0);i policy(0, N*VectorLength); - - double tavg = 0, tmin = tmax; - - // adjust column major order in xsmm - char transA = 'N', transB = 'N'; - libxsmm_blasint blksize = BlkSize; - double one = 1.0; - - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - // adjust transpose - double diff = 0; - for (int i=0,iend=cref.extent(0);i policy( + 0, N * VectorLength); + + double tavg = 0, tmin = tmax; + + // adjust column major order in xsmm + char transA = 'N', transB = 'N'; + libxsmm_blasint blksize = BlkSize; + double one = 1.0; + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + Kokkos::deep_copy(c, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmHost::libxswmmOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); + + // column major + libxsmm_gemm((const char *)&transA, (const char *)&transB, + blksize, blksize, blksize, (const double *)&one, + (const double *)bb.data(), + (const libxsmm_blasint *)&ldb, + (const double *)aa.data(), + (const libxsmm_blasint *)&lda, (const double *)&one, + (double *)cc.data(), (const libxsmm_blasint *)&ldc); + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + // adjust transpose + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += abs(cref(i, j, k) - c(i, j, k)); + + std::cout << std::setw(12) << "libxsmm" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; + } + libxsmm_finalize(); + } #endif - /// - /// Do not test this. 
Test Compact vs MKL - /// KK Scalar version (comparable to micro BLAS version) - /// - // if (!std::is_same::value) { - // Kokkos::View - // a("a", N*VectorLength, BlkSize, BlkSize), - // b("b", N*VectorLength, BlkSize, BlkSize), - // c("c", N*VectorLength, BlkSize, BlkSize); - - // { - // const Kokkos::RangePolicy policy(0, N*VectorLength); - - // double tavg = 0, tmin = tmax; - - // for (int iter=iter_begin;iter:: - // invoke(1.0, aa, bb, 1.0, cc); - // }); - - // HostSpaceType().fence(); - // const double t = timer.seconds(); - // tmin = std::min(tmin, t); - // tavg += (iter >= 0)*t; - // } - // tavg /= iter_end; - - // double diff = 0; - // for (int i=0,iend=cref.extent(2);i policy(0, N); - - double tavg = 0, tmin = tmax; - - for (int iter=iter_begin;iter:: - invoke(1.0, aa, bb, 1.0, cc); - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=cref.extent(0);i policy(0, + // N*VectorLength); + + // double tavg = 0, tmin = tmax; + + // for (int iter=iter_begin;iter:: + // invoke(1.0, aa, bb, 1.0, cc); + // }); + + // HostSpaceType().fence(); + // const double t = timer.seconds(); + // tmin = std::min(tmin, t); + // tavg += (iter >= 0)*t; + // } + // tavg /= iter_end; + + // double diff = 0; + // for (int i=0,iend=cref.extent(2);i policy(0, N); + + double tavg = 0, tmin = tmax; + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat_simd); + Kokkos::deep_copy(b, bmat_simd); + Kokkos::deep_copy(c, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemmHost::SIMDSerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + auto cc = Kokkos::subview(c, k, Kokkos::ALL(), Kokkos::ALL()); + + SerialGemm::invoke(1.0, aa, bb, 1.0, cc); + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - std::cout << std::endl; + tavg /= iter_end; + + double diff = 0; + for (int i = 0, iend = cref.extent(0); i < iend; ++i) + for (int j = 0, jend = cref.extent(1); j < jend; ++j) + for (int k = 0, kend = cref.extent(2); k < kend; ++k) + diff += abs(cref(i, j, k) - + c(i / VectorLength, j, k)[i % VectorLength]); + + std::cout << std::setw(12) << "KK Vector" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; } - - } // end perftest -} // end batched + } + std::cout << std::endl; +} +} // namespace PerfTest +} // namespace KokkosBatched diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp index 2fffa06855..484c519b1c 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Complex.cpp @@ -6,7 +6,7 @@ using namespace KokkosBatched; -template +template void run(const int N) { typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -19,35 +19,35 @@ void run(const int N) { // Test::Gemm<32, AlgoTagType>(N); // Test::Gemm<64, AlgoTagType>(N); - PerfTest::Gemm< 3, HostSpaceType, AlgoTagType>(N); - 
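A recurring detail in the host tests above is the SIMD repacking: VectorLength consecutive scalar matrices are interleaved into a single matrix of Vector<SIMD<value_type>,VectorLength> entries, so scalar batch index k lands in SIMD block k/VectorLength, lane k%VectorLength, and the result is read back as c(i/VectorLength, j, k)[i%VectorLength] when comparing against the scalar reference. The index arithmetic is shown below with plain arrays so it stands alone; the sizes are made up for the example.

#include <cassert>
#include <vector>

int main() {
  constexpr int VectorLength = 4, Nscalar = 8, Blk = 3;
  constexpr int Nsimd = Nscalar / VectorLength;
  // scalar layout: amat[k][i][j]; SIMD layout: amat_simd[k0][i][j][lane]
  std::vector<double> amat(Nscalar * Blk * Blk);
  std::vector<double> amat_simd(Nsimd * Blk * Blk * VectorLength);
  for (int k = 0; k < Nscalar * Blk * Blk; ++k) amat[k] = k;

  for (int k = 0; k < Nscalar; ++k) {
    const int k0 = k / VectorLength, k1 = k % VectorLength;  // SIMD block and lane
    for (int i = 0; i < Blk; ++i)
      for (int j = 0; j < Blk; ++j)
        amat_simd[((k0 * Blk + i) * Blk + j) * VectorLength + k1] =
            amat[(k * Blk + i) * Blk + j];
  }
  // spot-check: matrix 5, entry (1,2) lives in SIMD block 1, lane 1
  assert(amat_simd[((1 * Blk + 1) * Blk + 2) * VectorLength + 1] ==
         amat[(5 * Blk + 1) * Blk + 2]);
  return 0;
}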
PerfTest::Gemm< 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemm<3, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemm<5, HostSpaceType, AlgoTagType>(N); PerfTest::Gemm<10, HostSpaceType, AlgoTagType>(N); PerfTest::Gemm<15, HostSpaceType, AlgoTagType>(N); } -int main(int argc, char *argv[]) { - +int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; - //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; - int N[1] = { 128*128 }; + // const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; + int N[1] = {128 * 128}; - for (int i=1;i(N[i]); - + std::cout << "\n Testing Algo::Gemm::Blocked\n"; run(N[i]); @@ -55,7 +55,6 @@ int main(int argc, char *argv[]) { std::cout << "\n Testing Algo::Gemm::CompactMKL\n"; run(N[i]); #endif - } } diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp index 031909d540..b062942341 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemm_Host_Real.cpp @@ -5,7 +5,7 @@ using namespace KokkosBatched; -template +template void run(const int N) { typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -18,32 +18,31 @@ void run(const int N) { // Test::Gemm<32, AlgoTagType>(N); // Test::Gemm<64, AlgoTagType>(N); - PerfTest::Gemm< 3, HostSpaceType, AlgoTagType>(N); - PerfTest::Gemm< 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemm<3, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemm<5, HostSpaceType, AlgoTagType>(N); PerfTest::Gemm<10, HostSpaceType, AlgoTagType>(N); PerfTest::Gemm<15, HostSpaceType, AlgoTagType>(N); } -int main(int argc, char *argv[]) { - +int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; - //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; - int N[1] = { 128*128 }; + // const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; + int N[1] = {128 * 128}; - for (int i=1;i(N[i]); - + std::cout << "\n Testing Algo::Gemm::Blocked\n"; run(N[i]); @@ -51,7 +50,6 @@ int main(int argc, char *argv[]) { std::cout << "\n Testing Algo::Gemm::CompactMKL\n"; run(N[i]); #endif - } } diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host.hpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host.hpp index 0a45a0b56b..9480b810ba 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host.hpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host.hpp @@ -22,267 +22,271 @@ #undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { - namespace PerfTest { +namespace PerfTest { #undef FLOP_MUL #undef FLOP_ADD -#if defined( KokkosBatched_Test_Gemv_Host_Complex ) +#if defined(KokkosBatched_Test_Gemv_Host_Complex) #define FLOP_MUL 6.0 #define FLOP_ADD 2.0 - typedef Kokkos::complex value_type; +typedef Kokkos::complex value_type; #endif -#if defined( KokkosBatched_Test_Gemv_Host_Real ) +#if defined(KokkosBatched_Test_Gemv_Host_Real) #define FLOP_MUL 1.0 -#define FLOP_ADD 1.0 - typedef double value_type; +#define FLOP_ADD 1.0 +typedef double value_type; #endif - double FlopCount(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(m*n) + - FLOP_ADD*(m*n)); - } - - template - void Gemv(const int NN) { - typedef Kokkos::Schedule ScheduleType; - //typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = 
DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - -#if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; +double FlopCount(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (m * n) + FLOP_ADD * (m * n)); +} + +template +void Gemv(const int NN) { + typedef Kokkos::Schedule ScheduleType; + // typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; + +#if defined(__AVX512F__) + std::cout << "AVX512 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " + << value_type_name << " a vector length " << VectorLength << "\n"; #endif - } + } + + const double flop = + (N * VectorLength) * FlopCount(BlkSize, BlkSize) * NumVecs; + // const double tmax = 1.0e15; - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize)*NumVecs; - //const double tmax = 1.0e15; - - const int iter_begin = -10, iter_end = 100; - Kokkos::Timer timer; - - Kokkos::View yref; - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize); - Kokkos::View - xvec("xvec", N*VectorLength, NumVecs, BlkSize); - - Kokkos::Random_XorShift64_Pool random(13718); - Kokkos::fill_random(xvec, random, value_type(1.0)); - Kokkos::fill_random(amat, random, value_type(1.0)); - - // for KNL - constexpr size_t LLC_CAPACITY = 34*1024*1024; - Flush flush; - - /// - /// Reference version using MKL DGEMM - /// + const int iter_begin = -10, iter_end = 100; + Kokkos::Timer timer; + + Kokkos::View yref; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize); + Kokkos::View xvec( + "xvec", N * VectorLength, NumVecs, BlkSize); + + Kokkos::Random_XorShift64_Pool random(13718); + Kokkos::fill_random(xvec, random, value_type(1.0)); + Kokkos::fill_random(amat, random, value_type(1.0)); + + // for KNL + constexpr size_t LLC_CAPACITY = 34 * 1024 * 1024; + Flush flush; + + /// + /// Reference version using MKL DGEMM + /// #if defined(__KOKKOSBATCHED_INTEL_MKL__) - { - Kokkos::View - a("a", N*VectorLength, BlkSize, BlkSize), - x("x", N*VectorLength, NumVecs, BlkSize), - y("y", N*VectorLength, NumVecs, BlkSize); - - { - const Kokkos::RangePolicy policy(0, N*VectorLength); - - double t = 0; - for (int iter=iter_begin;iter= 0)*timer.seconds(); - } - t /= iter_end; - - std::cout << std::setw(12) << "MKL DGEMV" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumVecs = " << std::setw(3) << NumVecs - << " time = " << std::scientific << t - << " flop/s = " << (flop/t) - << std::endl; - - yref = y; - } + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize), + x("x", N * VectorLength, NumVecs, 
BlkSize), + y("y", N * VectorLength, NumVecs, BlkSize); + + { + const Kokkos::RangePolicy policy( + 0, N * VectorLength); + + double t = 0; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(x, xvec); + Kokkos::deep_copy(y, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemvHost::CblasOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + for (int j = 0; j < NumVecs; ++j) { + auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); + auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); + + cblas_dgemv(CblasRowMajor, CblasNoTrans, BlkSize, BlkSize, 1.0, + (double*)aa.data(), aa.stride_0(), + (double*)xx.data(), xx.stride_0(), 1.0, + (double*)yy.data(), yy.stride_0()); + } + }); + + HostSpaceType().fence(); + t += (iter >= 0) * timer.seconds(); } + t /= iter_end; + + std::cout << std::setw(12) << "MKL DGEMV" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t + << " flop/s = " << (flop / t) << std::endl; + + yref = y; + } + } #endif - - /// - /// Plain version (comparable to micro BLAS version) - /// - { - Kokkos::View - a("a", N*VectorLength, BlkSize, BlkSize), - x("x", N*VectorLength, NumVecs, BlkSize), - y("y", N*VectorLength, NumVecs, BlkSize); - - { - const Kokkos::RangePolicy policy(0, N*VectorLength); - - double t = 0; - for (int iter=iter_begin;iter:: - invoke(1.0, aa, xx, 1.0, yy); - } - }); - - HostSpaceType().fence(); - t += (iter >= 0)*timer.seconds(); - } - t /= iter_end; - - double diff = 0; - for (int i=0,iend=yref.extent(0);i policy( + 0, N * VectorLength); + + double t = 0; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(x, xvec); + Kokkos::deep_copy(y, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemvHost::SerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + + for (int j = 0; j < NumVecs; ++j) { + auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); + auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); + + SerialGemv::invoke(1.0, aa, xx, + 1.0, yy); + } + }); + + HostSpaceType().fence(); + t += (iter >= 0) * timer.seconds(); + } + t /= iter_end; + + double diff = 0; + for (int i = 0, iend = yref.extent(0); i < iend; ++i) + for (int j = 0, jend = yref.extent(1); j < jend; ++j) + for (int k = 0, kend = yref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(yref(i, j, k) - + y(i, j, k)); + + std::cout << std::setw(12) << "Plain" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t + << " flop/s = " << (flop / t) << " diff to ref = " << diff + << std::endl; + } + } + + typedef Vector, VectorLength> VectorType; + Kokkos::View amat_simd( + "amat_simd", N, BlkSize, BlkSize), + xvec_simd("xvec_simd", N, NumVecs, BlkSize); + + for (int k0 = 0; k0 < N; ++k0) + for (int k1 = 0; k1 < VectorLength; ++k1) + for (int i = 0; i < BlkSize; ++i) { + for (int j = 0; j < NumVecs; ++j) + xvec_simd(k0, j, i)[k1] = xvec(k0 * VectorLength + k1, j, i); + for (int j = 0; j < BlkSize; ++j) + amat_simd(k0, i, j)[k1] = amat(k0 * VectorLength + k1, i, j); } - - typedef Vector,VectorLength> 
VectorType; - Kokkos::View - amat_simd("amat_simd", N, BlkSize, BlkSize), - xvec_simd("xvec_simd", N, NumVecs, BlkSize); - - for (int k0=0;k0 - a("a", N, BlkSize, BlkSize), - x("x", N, NumVecs, BlkSize), - y("y", N, NumVecs, BlkSize); - - { - const Kokkos::RangePolicy policy(0, N); - - double t = 0; - for (int iter=iter_begin;iter:: - invoke(1.0, aa, xx, 1.0, yy); - } - }); - - HostSpaceType().fence(); - t += (iter >= 0)*timer.seconds(); - } - t /= iter_end; - - double diff = 0; - for (int i=0,iend=yref.extent(0);i policy(0, N); + + double t = 0; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat_simd); + Kokkos::deep_copy(x, xvec_simd); + Kokkos::deep_copy(y, 0); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for( + "KokkosBatched::PerfTest::GemvHost::SIMDSerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + + for (int j = 0; j < NumVecs; ++j) { + auto xx = Kokkos::subview(x, k, j, Kokkos::ALL()); + auto yy = Kokkos::subview(y, k, j, Kokkos::ALL()); + + SerialGemv::invoke(1.0, aa, xx, + 1.0, yy); + } + }); + + HostSpaceType().fence(); + t += (iter >= 0) * timer.seconds(); } + t /= iter_end; + + double diff = 0; + for (int i = 0, iend = yref.extent(0); i < iend; ++i) + for (int j = 0, jend = yref.extent(1); j < jend; ++j) + for (int k = 0, kend = yref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs( + yref(i, j, k) - y(i / VectorLength, j, k)[i % VectorLength]); + + std::cout << std::setw(12) << "Serial SIMD" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumVecs = " << std::setw(3) << NumVecs + << " time = " << std::scientific << t + << " flop/s = " << (flop / t) << " diff to ref = " << diff + << std::endl; } - } } + +} // namespace PerfTest +} // namespace KokkosBatched diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp index 56ade7a446..75f4bca4c0 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Gemv_Host_Real.cpp @@ -5,7 +5,7 @@ using namespace KokkosBatched; -template +template void run(const int N) { typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -18,33 +18,32 @@ void run(const int N) { // PerfTest::Gemv<32, 1, ExecSpace,AlgoTagType>(N); // PerfTest::Gemv<64, 1, ExecSpace,AlgoTagType>(N); - PerfTest::Gemv< 3, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Gemv< 5, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Gemv<10, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Gemv<15, 1, HostSpaceType,AlgoTagType>(N); + PerfTest::Gemv<3, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemv<5, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemv<10, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Gemv<15, 1, HostSpaceType, AlgoTagType>(N); } int main(int argc, char *argv[]) { - Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) const int ntest = 1; - //const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; - const int N[1] = { 128*128 }; + // const int N[6] = { 256, 512, 768, 1024, 1280, 1536 }; + const int N[1] = {128 * 128}; - { - for (int i=0;i(N[i]); - + std::cout << "\n Testing Algo::Gemv::Blocked\n"; run(N[i]); } } #endif Kokkos::finalize(); - + return 0; } diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Cuda.cpp 
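To make the printed flop/s concrete for the GEMV tests just above: FlopCount(m, n) counts one multiply and one add per matrix entry, i.e. 2*m*n flops for real data, so a 15x15 block costs 450 flops per matrix and per right-hand vector. With the default problem size NN = 128*128 = 16384 matrices and NumVecs = 1, each timed iteration therefore performs about 16384 * 450 = 7.37e6 flops, and the reported rate is simply that constant divided by the measured time. These figures are a worked example of the formula, not output from the tests.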
b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Cuda.cpp index dcd60af9f0..6cf9ec5725 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Cuda.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Cuda.cpp @@ -3,7 +3,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Timer.hpp" -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #include @@ -23,551 +23,565 @@ #include "KokkosBatched_LU_Team_Impl.hpp" namespace KokkosBatched { - namespace PerfTest { - +namespace PerfTest { + #define FLOP_MUL 1.0 #define FLOP_ADD 1.0 - typedef double value_type; - - double FlopCount(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - if (m > n) - return (FLOP_MUL*(0.5*m*n*n-(1.0/6.0)*n*n*n+0.5*m*n-0.5*n*n+(2.0/3.0)*n) + - FLOP_ADD*(0.5*m*n*n-(1.0/6.0)*n*n*n-0.5*m*n+ (1.0/6.0)*n)); - else - return (FLOP_MUL*(0.5*n*m*m-(1.0/6.0)*m*m*m+0.5*n*m-0.5*m*m+(2.0/3.0)*m) + - FLOP_ADD*(0.5*n*m*m-(1.0/6.0)*m*m*m-0.5*n*m+ (1.0/6.0)*m)); - } +typedef double value_type; + +double FlopCount(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + if (m > n) + return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + + 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + + FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - + 0.5 * m * n + (1.0 / 6.0) * n)); + else + return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + + 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + + FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - + 0.5 * n * m + (1.0 / 6.0) * m)); +} - struct RangeTag {}; - struct TeamTagV1 {}; - struct TeamTagV2 {}; - struct TeamTagV3 {}; - struct TeamTagHandmade {}; +struct RangeTag {}; +struct TeamTagV1 {}; +struct TeamTagV2 {}; +struct TeamTagV3 {}; +struct TeamTagHandmade {}; - template - struct Functor { - UnmanagedViewType _a; +template +struct Functor { + UnmanagedViewType _a; - KOKKOS_INLINE_FUNCTION - Functor() = default; + KOKKOS_INLINE_FUNCTION + Functor() = default; - KOKKOS_INLINE_FUNCTION - Functor(const ViewType &a) - : _a(a) {} + KOKKOS_INLINE_FUNCTION + Functor(const ViewType &a) : _a(a) {} - KOKKOS_INLINE_FUNCTION - void operator()(const RangeTag &, const int k) const { - auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); - SerialLU::invoke(aa); - } + KOKKOS_INLINE_FUNCTION + void operator()(const RangeTag &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + SerialLU::invoke(aa); + } - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV1 &, const MemberType &member) const { - const int kbeg = (member.league_rank()*(member.team_size()*VectorLength) + - member.team_rank()*VectorLength); - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - SerialLU::invoke(aa); - } - }); - } + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, + const MemberType &member) const { + const int kbeg = + (member.league_rank() * (member.team_size() * VectorLength) + + member.team_rank() * VectorLength); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + SerialLU::invoke(aa); + } + }); + } - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV2 &, const MemberType &member) const { - const int kbeg = 
member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - TeamLU::invoke(member, aa); - } - }); - } + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, + const MemberType &member) const { + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + TeamLU::invoke(member, aa); + } + }); + } - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV3 &, const MemberType &member) const { - const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); - - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < _a.extent_int(0)) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - member.team_barrier(); - TeamLU::invoke(member, saa); - member.team_barrier(); - TeamCopy::invoke(member, saa, aa); - } - }); - } + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, + const MemberType &member) const { + const int lvl = 0; + ScratchViewType sa(member.team_scratch(lvl), VectorLength, + _a.extent(1), _a.extent(2)); + + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < _a.extent_int(0)) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + member.team_barrier(); + TeamLU::invoke(member, saa); + member.team_barrier(); + TeamCopy::invoke(member, saa, aa); + } + }); + } +}; - }; - - template - void LU(const int NN, const int BlkSize) { - typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - - std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; - } +template +void LU(const int NN, const int BlkSize) { + typedef Kokkos::Schedule ScheduleType; - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize); - const double tmax = 1.0e15; - - typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; - typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; - - const int iter_begin = -3, iter_end = 50; - Kokkos::Timer timer; - - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - aref("aref", N*VectorLength, BlkSize, BlkSize); - - { - Random random; - for (int k=0;k::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; + + std::cout << "SIMD is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; + } + + 
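For the square blocks used here (m = n = BlkSize), the LU FlopCount above reduces to (2/3)*n^3 - (1/2)*n^2 + (5/6)*n, the familiar ~2n^3/3 cost of an unpivoted LU; a 10x10 block, for example, comes to exactly 625 flops. Pivoting is deliberately avoided: the test matrices generated below are tridiagonal with the diagonal shifted by +10, hence strongly diagonally dominant, so the unpivoted factorization is stable and the CUBLAS path can pass a NULL pivot array to cublasDgetrfBatched (which, per cuBLAS's documented behavior, disables pivoting), allowing its factors to be compared elementwise against the KokkosBatched results.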
const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize); + const double tmax = 1.0e15; - // value_type d[BlkSize], v[BlkSize][BlkSize]; - // for (int i=0;i amat( + "amat", N * VectorLength, BlkSize, BlkSize), + aref("aref", N * VectorLength, BlkSize, BlkSize); + + { + Random random; + for (int k = 0; k < N * VectorLength; ++k) { + // use tridiagonal matrices; for now we just check elementwise l/u factors + // do not allow pivots + for (int i = 0; i < BlkSize; ++i) { + amat(k, i, i) = random.value() + 10.0; + if ((i + 1) < BlkSize) { + amat(k, i, i + 1) = random.value() + 1.0; + amat(k, i + 1, i) = random.value() + 1.0; } } - constexpr size_t LLC_CAPACITY = 56*4*1024*1024; - Flush flush; - + // value_type d[BlkSize], v[BlkSize][BlkSize]; + // for (int i=0;i flush; + #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - if (1) { - /// - /// CUBLAS Batch version - /// - const Kokkos::LayoutStride stride(N*VectorLength, BlkSize*BlkSize, - BlkSize, 1, - BlkSize, BlkSize); - - Kokkos::View a("a", stride); - Kokkos::View info("info", N*VectorLength); - - cublasStatus_t stat; - cublasHandle_t handle; - - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); - - auto amat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), amat); - Kokkos::deep_copy(amat_device, amat); + if (1) { + /// + /// CUBLAS Batch version + /// + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, + BlkSize, 1, BlkSize, BlkSize); + + Kokkos::View a( + "a", stride); + Kokkos::View info("info", N * VectorLength); + + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + Kokkos::abort("CUBLAS initialization failed\n"); + + auto amat_device = Kokkos::create_mirror_view( + typename DeviceSpaceType::memory_space(), amat); + Kokkos::deep_copy(amat_device, amat); + + Kokkos::fence(); + { + double tavg = 0, tmin = tmax; + value_type *aa[N * VectorLength]; + + for (int k = 0; k < N * VectorLength; ++k) { + aa[k] = a.data() + k * a.stride_0(); + } + value_type **aa_device; + if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != + cudaSuccess) { + Kokkos::abort("CUDA memory allocation failed\n"); + } + if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, + cudaMemcpyHostToDevice) != cudaSuccess) { + Kokkos::abort("CUDA memcpy failed\n"); + } + Kokkos::fence(); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrix + Kokkos::deep_copy(a, amat_device); Kokkos::fence(); - { - double tavg = 0, tmin = tmax; - value_type *aa[N*VectorLength]; + timer.reset(); - for (int k=0;k= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - Kokkos::deep_copy(aref, asol); - - if (cudaFree(aa_device) != cudaSuccess) { - Kokkos::abort("CUDA memory free failed\n"); - } - - std::cout << std::setw(8) << "CUBLAS" - << std::setw(8) << "Batch" - << " BlkSize = " << std::setw(3) << BlkSize - << " TeamSize = N/A" - << " ScratchSize (KB) = N/A" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; + stat = cublasDgetrfBatched(handle, BlkSize, (value_type **)aa_device, + BlkSize, NULL, (int *)info.data(), + N * VectorLength); + if (stat != CUBLAS_STATUS_SUCCESS) { + Kokkos::abort("CUBLAS LU Batched failed\n"); } + + Kokkos::fence(); 
+ const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + auto asol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + Kokkos::deep_copy(asol, a); + Kokkos::deep_copy(aref, asol); + + if (cudaFree(aa_device) != cudaSuccess) { + Kokkos::abort("CUDA memory free failed\n"); + } + + std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Batch" + << " BlkSize = " << std::setw(3) << BlkSize << " TeamSize = N/A" + << " ScratchSize (KB) = N/A" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + } + } #endif - if (1) { - /// - /// Range policy version - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize); + if (1) { + /// + /// Range policy version + /// + typedef Kokkos::View view_type; + view_type a("a", N * VectorLength, BlkSize, BlkSize); - double tavg = 0, tmin = tmax; - { - typedef Functor functor_type; - const Kokkos::RangePolicy policy(0, N*VectorLength); + double tavg = 0, tmin = tmax; + { + typedef Functor functor_type; + const Kokkos::RangePolicy policy( + 0, N * VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - - double diff = 0; - for (int i=0,iend=aref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int team_size = + policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + + const policy_type policy(N / team_size, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrix + Kokkos::deep_copy(a, amat); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV1", + policy, functor_type(a)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - if (1) { - /// - /// Team V1 - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int team_size = - policy_type(N/32, Kokkos::AUTO, VectorLength).team_size_recommended(functor_type(), Kokkos::ParallelForTag()); - - const policy_type policy(N/team_size, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - - double diff = 0; - for (int i=0,iend=aref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::LU::Blocked::mb(); + // mp = BlkSize%mb > 0; + + const int + // mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; + mblk = is_blocked_algo ? 
(BlkSize - mb) : (BlkSize - 1); + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(mblk * 2, 1), max_team_size); + + const policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrix + Kokkos::deep_copy(a, amat); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::LUCuda::TeamTagV2", + policy, functor_type(a)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - if (1) { - /// - /// Team V2 - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::LU::Blocked::mb(); - //mp = BlkSize%mb > 0; - - const int - //mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; + tavg /= iter_end; + + auto asol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); + Kokkos::deep_copy(asol, a); + + double diff = 0; + for (int i = 0, iend = aref.extent(0); i < iend; ++i) + for (int j = 0, jend = aref.extent(1); j < jend; ++j) + for (int k = 0, kend = aref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(aref(i, j, k) - + asol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" + << " BlkSize = " << std::setw(3) << BlkSize + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); +#if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) + std::cout << " diff to ref = " << diff; +#endif + std::cout << std::endl; + } + } + if (1) { + /// + /// Team V3 + /// + typedef Kokkos::View view_type; + view_type a("a", N * VectorLength, BlkSize, BlkSize); + + double tavg = 0, tmin = tmax; + { + typedef Kokkos::TeamPolicy + policy_type; + typedef Functor functor_type; + + const int lvl = 0, + per_team_scratch = ScratchViewType::shmem_size( + VectorLength, BlkSize, BlkSize); + if (per_team_scratch / 1024 < 48) { + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::LU::Blocked::mb(); + // mp = BlkSize%mb > 0; + + const int + // mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; mblk = is_blocked_algo ? 
(BlkSize - mb) : (BlkSize - 1); - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(mblk*2,1), max_team_size); + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = std::min(std::max(mblk * 2, 1), max_team_size); - const policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - - double diff = 0; - for (int i=0,iend=aref.extent(0);i policy_type; - typedef Functor functor_type; - - const int lvl = 0, per_team_scratch = ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); - if (per_team_scratch/1024 < 48) { - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::LU::Blocked::mb(); - // mp = BlkSize%mb > 0; - - const int - //mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; - mblk = is_blocked_algo ? (BlkSize - mb) : (BlkSize - 1); - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(mblk*2,1), max_team_size); - - policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto asol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), a); - Kokkos::deep_copy(asol, a); - - double diff = 0; - for (int i=0,iend=aref.extent(0);i +template void run(const int N, const int B) { typedef Kokkos::DefaultExecutionSpace ExecSpace; Kokkos::print_configuration(std::cout, false); if (B != 0) { - PerfTest::LU(N,B); + PerfTest::LU(N, B); } else { PerfTest::LU(N, 3); PerfTest::LU(N, 5); - PerfTest::LU(N,10); - PerfTest::LU(N,15); + PerfTest::LU(N, 10); + PerfTest::LU(N, 15); } } int main(int argc, char *argv[]) { - Kokkos::initialize(argc, argv); - int N = 128*128, B = 0; + int N = 128 * 128, B = 0; - for (int i=1;i(N,B); - + run(N, B); + std::cout << "\n Testing LayoutLeft Algo::LU::Blocked\n"; - run(N,B); + run(N, B); } Kokkos::finalize(); diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host.hpp b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host.hpp index 33cbd78b6c..68daa24eb1 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host.hpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host.hpp @@ -20,312 +20,324 @@ #undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { - namespace PerfTest { +namespace PerfTest { #undef FLOP_MUL #undef FLOP_ADD - // no complex yet -#if defined( KokkosBatched_Test_LU_Host_Complex ) +// no complex yet +#if defined(KokkosBatched_Test_LU_Host_Complex) #define FLOP_MUL 6.0 #define FLOP_ADD 2.0 - typedef Kokkos::complex value_type; +typedef Kokkos::complex value_type; #endif -#if defined( KokkosBatched_Test_LU_Host_Real ) +#if defined(KokkosBatched_Test_LU_Host_Real) #define FLOP_MUL 1.0 -#define FLOP_ADD 1.0 - typedef double value_type; +#define FLOP_ADD 1.0 +typedef double value_type; #endif - double FlopCount(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - if (m > n) - return (FLOP_MUL*(0.5*m*n*n-(1.0/6.0)*n*n*n+0.5*m*n-0.5*n*n+(2.0/3.0)*n) + - 
FLOP_ADD*(0.5*m*n*n-(1.0/6.0)*n*n*n-0.5*m*n+ (1.0/6.0)*n)); - else - return (FLOP_MUL*(0.5*n*m*m-(1.0/6.0)*m*m*m+0.5*n*m-0.5*m*m+(2.0/3.0)*m) + - FLOP_ADD*(0.5*n*m*m-(1.0/6.0)*m*m*m-0.5*n*m+ (1.0/6.0)*m)); - } - - template - void LU(const int NN) { - typedef Kokkos::Schedule ScheduleType; - //typedef Kokkos::Schedule ScheduleType; - - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - -#if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; +double FlopCount(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + if (m > n) + return (FLOP_MUL * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n + + 0.5 * m * n - 0.5 * n * n + (2.0 / 3.0) * n) + + FLOP_ADD * (0.5 * m * n * n - (1.0 / 6.0) * n * n * n - + 0.5 * m * n + (1.0 / 6.0) * n)); + else + return (FLOP_MUL * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m + + 0.5 * n * m - 0.5 * m * m + (2.0 / 3.0) * m) + + FLOP_ADD * (0.5 * n * m * m - (1.0 / 6.0) * m * m * m - + 0.5 * n * m + (1.0 / 6.0) * m)); +} + +template +void LU(const int NN) { + typedef Kokkos::Schedule ScheduleType; + // typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; + +#if defined(__AVX512F__) + std::cout << "AVX512 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " + << value_type_name << " a vector length " << VectorLength << "\n"; #endif + } + + const double flop = (N * VectorLength) * FlopCount(BlkSize, BlkSize); + const double tmax = 1.0e15; + + const int iter_begin = -10, iter_end = 100; + Kokkos::Timer timer; + + /// + /// Reference version using MKL DGETRF + /// + Kokkos::View aref; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize); + + Random random; + + for (int k = 0; k < N * VectorLength; ++k) { + // use tridiagonal matrices; for now we just check elementwise l/u factors + // do not allow pivots + for (int i = 0; i < BlkSize; ++i) { + amat(k, i, i) = random.value() + 10.0; + if ((i + 1) < BlkSize) { + amat(k, i, i + 1) = random.value() + 1.0; + amat(k, i + 1, i) = random.value() + 1.0; } - - const double flop = (N*VectorLength)*FlopCount(BlkSize,BlkSize); - const double tmax = 1.0e15; - - const int iter_begin = -10, iter_end = 100; - Kokkos::Timer timer; - - /// - /// Reference version using MKL DGETRF - /// - Kokkos::View aref; - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize); - - Random random; - - for (int k=0;k, VectorLength> VectorType; + Kokkos::View amat_simd( + "amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize); + + Kokkos::parallel_for( + 
"KokkosBatched::PerfTest::LUHost::Pack", + Kokkos::RangePolicy(0, N * VectorLength), + KOKKOS_LAMBDA(const int k) { + const int k0 = k / VectorLength, k1 = k % VectorLength; + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) { + amat_simd(k0, i, j)[k1] = amat(k0 * VectorLength + k1, i, j); } - } - } + }); - typedef Vector,VectorLength> VectorType; - Kokkos::View - amat_simd("amat_simd", N, BlkSize, BlkSize); //, a("a", N, BlkSize, BlkSize); - - Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::Pack", - Kokkos::RangePolicy(0, N*VectorLength), - KOKKOS_LAMBDA(const int k) { - const int k0 = k/VectorLength, k1 = k%VectorLength; - for (int i=0;i flush; - - /// - /// Reference version using MKL DGETRF - /// -#if defined(__KOKKOSBATCHED_INTEL_MKL__) - { - Kokkos::View a("a", N*VectorLength, BlkSize, BlkSize); - Kokkos::View p("p", N*VectorLength, BlkSize); - { - double tavg = 0, tmin = tmax; - for (int iter=iter_begin;iter policy(0, N*VectorLength); - Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::LAPACKE_dgetrfOpenMP", - policy, - KOKKOS_LAMBDA(const int k) { - auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - auto pp = Kokkos::subview(p, k, Kokkos::ALL()); - LAPACKE_dgetrf(LAPACK_ROW_MAJOR, - BlkSize, BlkSize, - (double*)aa.data(), aa.stride_0(), - (int*)pp.data()); - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - std::cout << std::setw(10) << "MKL LU" - << " BlkSize = " << std::setw(3) << BlkSize - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; - } + // for KNL + constexpr size_t LLC_CAPACITY = 34 * 1024 * 1024; + Flush flush; - aref = a; + /// + /// Reference version using MKL DGETRF + /// +#if defined(__KOKKOSBATCHED_INTEL_MKL__) + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize); + Kokkos::View p( + "p", N * VectorLength, BlkSize); + { + double tavg = 0, tmin = tmax; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrix + Kokkos::deep_copy(a, amat); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::RangePolicy policy( + 0, N * VectorLength); + Kokkos::parallel_for( + "KokkosBatched::PerfTest::LUHost::LAPACKE_dgetrfOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto pp = Kokkos::subview(p, k, Kokkos::ALL()); + LAPACKE_dgetrf(LAPACK_ROW_MAJOR, BlkSize, BlkSize, + (double*)aa.data(), aa.stride_0(), + (int*)pp.data()); + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + std::cout << std::setw(10) << "MKL LU" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + } + + aref = a; + } #if defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) #endif #if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) - { - Kokkos::View - a("a", N, BlkSize, BlkSize); - - { - double tavg = 0, tmin = tmax; - MKL_COMPACT_PACK format; - if (VectorLength == 8) format = MKL_COMPACT_AVX512; - else if (VectorLength == 4) format = MKL_COMPACT_AVX; - - if (format == MKL_COMPACT_AVX512 || format == MKL_COMPACT_AVX) { - int info; - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int 
i=0,iend=aref.extent(0);i= 0) * t; } + tavg /= iter_end; + + double diff = 0; + for (int i = 0, iend = aref.extent(0); i < iend; ++i) + for (int j = 0, jend = aref.extent(1); j < jend; ++j) + for (int k = 0, kend = aref.extent(2); k < kend; ++k) + diff += abs(aref(i, j, k) - + a(i / VectorLength, j, k)[i % VectorLength]); + + std::cout << std::setw(10) << "MKL Cmpt" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; } + } + } #endif #endif - // /// - // /// Plain version (comparable to micro BLAS version) - // /// - - // { - // Kokkos::View - // a("a", N*VectorLength, BlkSize, BlkSize); - - // { - // double tavg = 0, tmin = tmax; - // for (int iter=iter_begin;iter policy(0, N*VectorLength); - // Kokkos::parallel_for - // (policy, - // KOKKOS_LAMBDA(const int k) { - // auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - - // SerialLU::invoke(aa); - // }); - - // HostSpaceType().fence(); - // const double t = timer.seconds(); - // tmin = std::min(tmin, t); - // tavg += (iter >= 0)*t; - // } - // tavg /= iter_end; - - // double diff = 0; - // for (int i=0,iend=aref.extent(0);i policy(0, N); - Kokkos::parallel_for("KokkosBatched::PerfTest::LUHost::SIMDSerialOpenMP", - policy, - KOKKOS_LAMBDA(const int k) { - auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - - SerialLU::invoke(aa); - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=aref.extent(0);i policy(0, + // N*VectorLength); Kokkos::parallel_for + // (policy, + // KOKKOS_LAMBDA(const int k) { + // auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + + // SerialLU::invoke(aa); + // }); + + // HostSpaceType().fence(); + // const double t = timer.seconds(); + // tmin = std::min(tmin, t); + // tavg += (iter >= 0)*t; + // } + // tavg /= iter_end; + + // double diff = 0; + // for (int i=0,iend=aref.extent(0);i policy(0, N); + Kokkos::parallel_for( + "KokkosBatched::PerfTest::LUHost::SIMDSerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + + SerialLU::invoke(aa); + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + double diff = 0; + for (int i = 0, iend = aref.extent(0); i < iend; ++i) + for (int j = 0, jend = aref.extent(1); j < jend; ++j) + for (int k = 0, kend = aref.extent(2); k < kend; ++k) + diff += abs(aref(i, j, k) - + a(i / VectorLength, j, k)[i % VectorLength]); + std::cout << std::setw(10) << "SIMD" + << " BlkSize = " << std::setw(3) << BlkSize + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; } + } +} - } // namespace PerfTest -} // namespace KokkosBatched - - +} // namespace PerfTest +} // namespace KokkosBatched diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp index 7d352283c6..6c0736501d 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_LU_Host_Real.cpp @@ -5,36 +5,35 @@ using namespace KokkosBatched; -template +template void run(const int N) { 
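  // run() prints the Kokkos configuration and then times PerfTest::LU on the
  // default host execution space for block sizes 3, 5, 10 and 15; N is the
  // total number of matrices and PerfTest::LU divides it by the SIMD vector
  // length internally. The driver is templated on the algorithm tag used in
  // the PerfTest::LU<...> instantiations below; a hedged sketch of the
  // signature and of how main() invokes it, matching the
  // "Testing Algo::LU::..." messages printed there:
  //
  //   template <typename AlgoTagType>
  //   void run(const int N);
  //
  //   run<Algo::LU::Unblocked>(N);
  //   run<Algo::LU::Blocked>(N);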
typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; Kokkos::print_configuration(std::cout, false); - PerfTest::LU< 3, HostSpaceType,AlgoTagType>(N); - PerfTest::LU< 5, HostSpaceType,AlgoTagType>(N); - PerfTest::LU<10, HostSpaceType,AlgoTagType>(N); - PerfTest::LU<15, HostSpaceType,AlgoTagType>(N); + PerfTest::LU<3, HostSpaceType, AlgoTagType>(N); + PerfTest::LU<5, HostSpaceType, AlgoTagType>(N); + PerfTest::LU<10, HostSpaceType, AlgoTagType>(N); + PerfTest::LU<15, HostSpaceType, AlgoTagType>(N); } -int main(int argc, char *argv[]) { - +int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) - int N = 128*128; + int N = 128 * 128; - for (int i=1;i(N); - + std::cout << "\n Testing Algo::LU::Blocked\n"; run(N); diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp index 807b7a884e..6000bc7c9d 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Cuda.cpp @@ -3,7 +3,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_Timer.hpp" -#if defined(KOKKOS_ENABLE_CUDA) +#if defined(KOKKOS_ENABLE_CUDA) #include @@ -24,755 +24,746 @@ #include "KokkosBatched_Trsm_Team_Impl.hpp" namespace KokkosBatched { - namespace PerfTest { - +namespace PerfTest { + #undef FLOP_MUL #undef FLOP_ADD #define FLOP_MUL 1.0 #define FLOP_ADD 1.0 - typedef double value_type; +typedef double value_type; - double FlopCountLower(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(0.5*m*n*(n+1.0)) + - FLOP_ADD*(0.5*m*n*(n-1.0))); - } - - double FlopCountUpper(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(0.5*m*n*(n+1.0)) + - FLOP_ADD*(0.5*m*n*(n-1.0))); +double FlopCountLower(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + + FLOP_ADD * (0.5 * m * n * (n - 1.0))); +} + +double FlopCountUpper(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + + FLOP_ADD * (0.5 * m * n * (n - 1.0))); +} + +struct RangeTag {}; +struct TeamTagV1 {}; +struct TeamTagV2 {}; +struct TeamTagV3 {}; +struct TeamTagHandmade {}; + +template +struct Functor { + ConstUnmanagedViewType _a; + UnmanagedViewType _b; + + KOKKOS_INLINE_FUNCTION + Functor() = default; + + KOKKOS_INLINE_FUNCTION + Functor(const ViewType &a, const ViewType &b) : _a(a), _b(b) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const RangeTag &, const int k) const { + auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 1: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 2: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 3: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 4: + SerialTrsm::invoke(1.0, aa, bb); + break; } + } - struct RangeTag {}; - struct TeamTagV1 {}; - struct TeamTagV2 {}; - struct TeamTagV3 {}; - struct TeamTagHandmade {}; - - template - struct Functor { - ConstUnmanagedViewType _a; - UnmanagedViewType _b; - - KOKKOS_INLINE_FUNCTION - Functor() = default; - - KOKKOS_INLINE_FUNCTION - Functor(const ViewType &a, - const ViewType &b) - : _a(a), _b(b) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const RangeTag &, const int k) const { - auto aa = Kokkos::subview(_a, k, 
Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), Kokkos::ALL()); + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV1 &, + const MemberType &member) const { + const int kbeg = + (member.league_rank() * (member.team_size() * VectorLength) + + member.team_rank() * VectorLength); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - switch (test) { - case 0: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 1: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 2: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 3: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 4: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - } - } - - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV1 &, const MemberType &member) const { - const int kbeg = (member.league_rank()*(member.team_size()*VectorLength) + - member.team_rank()*VectorLength); - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - SerialTrsm:: - invoke(1.0, aa, bb); + switch (test) { + case 0: + SerialTrsm::invoke(1.0, aa, bb); break; case 1: - SerialTrsm:: - invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 2: - SerialTrsm:: - invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 3: - SerialTrsm:: - invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; case 4: - SerialTrsm:: - invoke(1.0, aa, bb); + SerialTrsm::invoke(1.0, aa, bb); break; - } } - }); - } - - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV2 &, const MemberType &member) const { - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + } + }); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV2 &, + const MemberType &member) const { + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + TeamTrsm::invoke(member, 1.0, aa, bb); break; case 1: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + TeamTrsm::invoke(member, 1.0, aa, bb); break; case 2: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + TeamTrsm::invoke(member, 1.0, aa, bb); break; case 3: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + TeamTrsm::invoke(member, 1.0, aa, bb); break; case 4: - TeamTrsm:: - invoke(member, 1.0, aa, bb); + TeamTrsm::invoke(member, 1.0, aa, bb); break; - } } - }); - } + } + }); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamTagV3 &, + const MemberType &member) const { + 
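    // TeamTagV3 is the team-parallel variant that stages the triangular
    // factor in team scratch: each team copies its VectorLength A blocks into
    // level-0 scratch memory (shared memory on CUDA) with TeamCopy, issues a
    // team_barrier(), and then runs TeamTrsm against the scratch copy so the
    // repeated reads of A during the solve hit fast memory. B stays in global
    // memory; the matching sb scratch view is intentionally left commented
    // out below. The launch site sizes the request with
    // ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize), attaches
    // it via set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)), and
    // only runs this variant when the request stays under the 48 KB per-team
    // limit checked later in this file.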
const int lvl = 0; + ScratchViewType sa(member.team_scratch(lvl), VectorLength, + _a.extent(1), _a.extent(2)); + // ScratchViewType sb(member.team_scratch(lvl), VectorLength, + // _b.extent(1), _b.extent(2)); - template - KOKKOS_INLINE_FUNCTION - void operator()(const TeamTagV3 &, const MemberType &member) const { - const int lvl = 0; - ScratchViewType sa(member.team_scratch(lvl), VectorLength, _a.extent(1), _a.extent(2)); - //ScratchViewType sb(member.team_scratch(lvl), VectorLength, _b.extent(1), _b.extent(2)); - - const int kbeg = member.league_rank()*VectorLength; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, VectorLength), - [&](const int &k) { - const int kk = kbeg + k; - if (kk < int(_b.extent(0))) { - auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); - - auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); - - TeamCopy::invoke(member, aa, saa); - member.team_barrier(); - - switch (test) { - case 0: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + const int kbeg = member.league_rank() * VectorLength; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, VectorLength), [&](const int &k) { + const int kk = kbeg + k; + if (kk < int(_b.extent(0))) { + auto aa = Kokkos::subview(_a, kk, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(_b, kk, Kokkos::ALL(), Kokkos::ALL()); + + auto saa = Kokkos::subview(sa, k, Kokkos::ALL(), Kokkos::ALL()); + + TeamCopy::invoke(member, aa, saa); + member.team_barrier(); + + switch (test) { + case 0: + TeamTrsm::invoke(member, 1.0, saa, bb); break; case 1: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + TeamTrsm::invoke(member, 1.0, saa, bb); break; case 2: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + TeamTrsm::invoke(member, 1.0, saa, bb); break; case 3: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + TeamTrsm::invoke(member, 1.0, saa, bb); break; case 4: - TeamTrsm:: - invoke(member, 1.0, saa, bb); + TeamTrsm::invoke(member, 1.0, saa, bb); break; - } } - }); - } + } + }); + } +}; - }; +template +void Trsm(const int NN, const int BlkSize, const int NumCols) { + typedef Kokkos::Schedule ScheduleType; + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; - template - void Trsm(const int NN, const int BlkSize, const int NumCols) { - typedef Kokkos::Schedule ScheduleType; + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; - - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; - - std::cout << "SIMD is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; - } + std::cout << "SIMD is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; + } - switch (test) { - case 0: std::cout << "TestID = Left, Lower, NoTrans, UnitDiag\n"; break; - case 1: std::cout << "TestID = Left, Lower, NoTrans, NonUnitDiag\n"; break; - case 2: std::cout << "TestID = Right, Upper, NoTrans, UnitDiag\n"; break; - case 3: std::cout << "TestID = Right, Upper, NoTrans, NonUnitDiag\n"; break; - case 4: std::cout << "TestID = Left, Upper, NoTrans, NonUnitDiag\n"; break; - } + switch (test) { + case 0: std::cout << "TestID = Left, Lower, NoTrans, UnitDiag\n"; break; + 
case 1: std::cout << "TestID = Left, Lower, NoTrans, NonUnitDiag\n"; break; + case 2: std::cout << "TestID = Right, Upper, NoTrans, UnitDiag\n"; break; + case 3: std::cout << "TestID = Right, Upper, NoTrans, NonUnitDiag\n"; break; + case 4: std::cout << "TestID = Left, Upper, NoTrans, NonUnitDiag\n"; break; + } - // when m == n, lower upper does not matter (unit and nonunit) - double flop = 0; - switch (test) { - case 0: - case 1: - flop = FlopCountLower(BlkSize,NumCols); - break; - case 2: - case 3: - case 4: - flop = FlopCountUpper(BlkSize,NumCols); - break; - } - flop *= (N*VectorLength); - const double tmax = 1.0e15; - - typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; - typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; - - const int iter_begin = -3, iter_end = 30; - Kokkos::Timer timer; - - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - bmat("bmat", N*VectorLength, BlkSize, NumCols), - bref("bmat", N*VectorLength, BlkSize, NumCols); - - { - Random random; - for (int k=0;k flush; + // when m == n, lower upper does not matter (unit and nonunit) + double flop = 0; + switch (test) { + case 0: + case 1: flop = FlopCountLower(BlkSize, NumCols); break; + case 2: + case 3: + case 4: flop = FlopCountUpper(BlkSize, NumCols); break; + } + flop *= (N * VectorLength); + const double tmax = 1.0e15; -#if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - if (1) { - /// - /// CUBLAS Batch version - /// - const Kokkos::LayoutStride stride(N*VectorLength, BlkSize*BlkSize, - BlkSize, 1, - BlkSize, BlkSize); + typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; + typedef typename DeviceSpaceType::memory_space DeviceMemorySpaceType; - Kokkos::View - a("a", stride), - b("b", stride); + const int iter_begin = -3, iter_end = 30; + Kokkos::Timer timer; - cublasStatus_t stat; - cublasHandle_t handle; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, NumCols), + bref("bmat", N * VectorLength, BlkSize, NumCols); - stat = cublasCreate(&handle); - if (stat != CUBLAS_STATUS_SUCCESS) - Kokkos::abort("CUBLAS initialization failed\n"); + { + Random random; + for (int k = 0; k < N * VectorLength; ++k) { + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) + amat(k, i, j) = random.value() + 4.0 * (i == j); + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < NumCols; ++j) bmat(k, i, j) = random.value(); + } + } - auto amat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), amat); - auto bmat_device = Kokkos::create_mirror_view(typename DeviceSpaceType::memory_space(), bmat); + // P100 L2 cache 4MB per core + constexpr size_t LLC_CAPACITY = 56 * 4 * 1024 * 1024; + Flush flush; - Kokkos::deep_copy(amat_device, amat); - Kokkos::deep_copy(bmat_device, bmat); +#if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) + if (1) { + /// + /// CUBLAS Batch version + /// + const Kokkos::LayoutStride stride(N * VectorLength, BlkSize * BlkSize, + BlkSize, 1, BlkSize, BlkSize); + + Kokkos::View a( + "a", stride), + b("b", stride); + + cublasStatus_t stat; + cublasHandle_t handle; + + stat = cublasCreate(&handle); + if (stat != CUBLAS_STATUS_SUCCESS) + Kokkos::abort("CUBLAS initialization failed\n"); + + auto amat_device = Kokkos::create_mirror_view( + typename DeviceSpaceType::memory_space(), amat); + auto bmat_device = Kokkos::create_mirror_view( + typename DeviceSpaceType::memory_space(), bmat); + + Kokkos::deep_copy(amat_device, amat); + Kokkos::deep_copy(bmat_device, bmat); + + 
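    // The batched cuBLAS path below uses the standard *Batched pointer-array
    // pattern: the LayoutStride views keep each matrix contiguous as a block
    // of BlkSize * BlkSize entries, a host array of per-matrix device
    // pointers is built from the base pointer plus k * stride_0(), and that
    // pointer array is itself copied to the device because cublasDtrsmBatched
    // expects a device-resident array of matrix pointers. A minimal sketch of
    // the pattern (hypothetical names, same arithmetic as the code below):
    //
    //   std::vector<value_type *> host_ptrs(batch);
    //   for (int k = 0; k < batch; ++k)
    //     host_ptrs[k] = a.data() + k * a.stride_0();
    //   value_type **dev_ptrs = nullptr;
    //   cudaMalloc(&dev_ptrs, batch * sizeof(value_type *));
    //   cudaMemcpy(dev_ptrs, host_ptrs.data(), batch * sizeof(value_type *),
    //              cudaMemcpyHostToDevice);
    //   // dev_ptrs is then passed as the Aarray/Barray argument.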
Kokkos::fence(); + + const double one(1.0); //, zero(0.0); + { + double tavg = 0, tmin = tmax; + value_type *aa[N * VectorLength], *bb[N * VectorLength]; + for (int k = 0; k < N * VectorLength; ++k) { + aa[k] = a.data() + k * a.stride_0(); + bb[k] = b.data() + k * b.stride_0(); + } + value_type **aa_device, **bb_device; + if (cudaMalloc(&aa_device, N * VectorLength * sizeof(value_type *)) != + cudaSuccess || + cudaMalloc(&bb_device, N * VectorLength * sizeof(value_type *)) != + cudaSuccess) { + Kokkos::abort("CUDA memory allocation failed\n"); + } + if (cudaMemcpy(aa_device, aa, sizeof(value_type *) * N * VectorLength, + cudaMemcpyHostToDevice) != cudaSuccess || + cudaMemcpy(bb_device, bb, sizeof(value_type *) * N * VectorLength, + cudaMemcpyHostToDevice) != cudaSuccess) { + Kokkos::abort("CUDA memcpy failed\n"); + } + Kokkos::fence(); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); - Kokkos::fence(); + // initialize matrices + Kokkos::deep_copy(a, amat_device); + Kokkos::deep_copy(b, bmat_device); - const double one(1.0); //, zero(0.0); - { - double tavg = 0, tmin = tmax; - value_type - *aa[N*VectorLength], - *bb[N*VectorLength]; - for (int k=0;k= 0)*t; + case 3: { + // Right, Upper, NoTrans, NonUnitDiag + stat = cublasDtrsmBatched( + handle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, + (const value_type **)aa_device, BlkSize, + (value_type **)bb_device, BlkSize, N * VectorLength); + break; } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - Kokkos::deep_copy(bref, bsol); - - if (cudaFree(aa_device) != cudaSuccess || - cudaFree(bb_device) != cudaSuccess) { - Kokkos::abort("CUDA memory free failed\n"); + case 4: { + // Left, Upper, NoTrans, NonUnitDiag + stat = cublasDtrsmBatched( + handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, BlkSize, NumCols, &one, + (const value_type **)aa_device, BlkSize, + (value_type **)bb_device, BlkSize, N * VectorLength); + break; } - - std::cout << std::setw(8) << "CUBLAS" - << std::setw(8) << "Batched" - << " BlkSize = " << std::setw(3) << BlkSize - << " NumCols = " << std::setw(3) << NumCols - << " TeamSize = N/A" - << " ScratchSize (KB) = N/A" - << " time = " << std::scientific << tmin - << " avg flop/s = " << (flop/tavg) - << " max flop/s = " << (flop/tmin) - << std::endl; } - cublasDestroy(handle); + + if (stat != CUBLAS_STATUS_SUCCESS) { + Kokkos::abort("CUBLAS Trsm Batched failed\n"); + } + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } + tavg /= iter_end; + + auto bsol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + Kokkos::deep_copy(bref, bsol); + + if (cudaFree(aa_device) != cudaSuccess || + cudaFree(bb_device) != cudaSuccess) { + Kokkos::abort("CUDA memory free failed\n"); + } + + std::cout << std::setw(8) << "CUBLAS" << std::setw(8) << "Batched" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols << " TeamSize = N/A" + << " ScratchSize (KB) = N/A" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << std::endl; + } + cublasDestroy(handle); + } #endif - if (1) { - /// - /// Range policy version - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", 
N*VectorLength, BlkSize, NumCols); - - double tavg = 0, tmin = tmax; - { - typedef Functor functor_type; - const Kokkos::RangePolicy policy(0, N*VectorLength); - - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - - double diff = 0; - for (int i=0,iend=bref.extent(0);i functor_type; + const Kokkos::RangePolicy policy( + 0, N * VectorLength); + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::RangeTag", policy, + functor_type(a, b)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto bsol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; ++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - + bsol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Range" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols << " TeamSize = N/A" + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V1 - almost same scheduling with range policy - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, NumCols); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int team_size = - policy_type(N/32, Kokkos::AUTO, VectorLength).team_size_recommended(functor_type(), Kokkos::ParallelForTag()); - - const policy_type policy(N/team_size, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - - double diff = 0; - for (int i=0,iend=bref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int team_size = + policy_type(N / 32, Kokkos::AUTO, VectorLength) + .team_size_recommended(functor_type(), Kokkos::ParallelForTag()); + + const policy_type policy(N / team_size, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + Kokkos::fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV1", policy, + functor_type(a, b)); + + Kokkos::fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto bsol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; 
++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - + bsol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V1" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V2 - team parallel - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, NumCols); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::Trsm::Blocked::mb(), - mp = BlkSize%mb > 0; - - const int - mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(NumCols,(mblk-1)*mblk), max_team_size); - - const policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - - double diff = 0; - for (int i=0,iend=bref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::Trsm::Blocked::mb(), + mp = BlkSize % mb > 0; + + const int mblk = is_blocked_algo ? 
(BlkSize / mb + mp) : BlkSize; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = + std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); + + const policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + DeviceSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV2", policy, + functor_type(a, b)); + + DeviceSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto bsol = + Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; ++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - + bsol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V2" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = 0" + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } - } + std::cout << std::endl; + } + } - if (1) { - /// - /// Team policy V3 - team parallel + sratch - /// - typedef Kokkos::View view_type; - view_type - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, NumCols); - - double tavg = 0, tmin = tmax; - { - typedef Kokkos::TeamPolicy policy_type; - typedef Functor functor_type; - - const int lvl = 0, per_team_scratch - = ScratchViewType::shmem_size(VectorLength, BlkSize, BlkSize); - - if (per_team_scratch/1024 < 48) { - const int - is_blocked_algo = (std::is_same::value), - mb = Algo::Trsm::Blocked::mb(), - mp = BlkSize%mb > 0; - - const int - mblk = is_blocked_algo ? (BlkSize/mb + mp) : BlkSize; - - const int max_team_size = - policy_type(N, Kokkos::AUTO, VectorLength).set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) - .team_size_max(functor_type(), Kokkos::ParallelForTag()); - const int team_size = std::min(std::max(NumCols,(mblk-1)*mblk), max_team_size); - - policy_type policy(N, team_size, VectorLength); - for (int iter=iter_begin;iter= 0)*t; - } - tavg /= iter_end; - - auto bsol = Kokkos::create_mirror_view(typename HostSpaceType::memory_space(), b); - Kokkos::deep_copy(bsol, b); - - double diff = 0; - for (int i=0,iend=bref.extent(0);i + policy_type; + typedef Functor functor_type; + + const int lvl = 0, + per_team_scratch = ScratchViewType::shmem_size( + VectorLength, BlkSize, BlkSize); + + if (per_team_scratch / 1024 < 48) { + const int is_blocked_algo = + (std::is_same::value), + mb = Algo::Trsm::Blocked::mb(), + mp = BlkSize % mb > 0; + + const int mblk = is_blocked_algo ? 
(BlkSize / mb + mp) : BlkSize; + + const int max_team_size = + policy_type(N, Kokkos::AUTO, VectorLength) + .set_scratch_size(lvl, Kokkos::PerTeam(per_team_scratch)) + .team_size_max(functor_type(), Kokkos::ParallelForTag()); + const int team_size = + std::min(std::max(NumCols, (mblk - 1) * mblk), max_team_size); + + policy_type policy(N, team_size, VectorLength); + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + DeviceSpaceType().fence(); + timer.reset(); + + Kokkos::parallel_for("KokkosBatched::PerfTest::TeamTagV3", policy, + functor_type(a, b)); + + DeviceSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; + } + tavg /= iter_end; + + auto bsol = Kokkos::create_mirror_view( + typename HostSpaceType::memory_space(), b); + Kokkos::deep_copy(bsol, b); + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; ++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs(bref(i, j, k) - + bsol(i, j, k)); + + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " TeamSize = " << std::setw(3) << team_size + << " ScratchSize (KB) = " << std::setw(3) + << (per_team_scratch / 1024) << " time = " << std::scientific + << tmin << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin); #if defined(__KOKKOSKERNELS_NVIDIA_CUBLAS__) - std::cout << " diff to ref = " << diff; + std::cout << " diff to ref = " << diff; #endif - std::cout << std::endl; - } else { - std::cout << std::setw(8) << "Kokkos" - << std::setw(8) << "Team V3" - << " Scratch per team is too big (KB): " << (per_team_scratch/1024) - << std::endl; - } - } + std::cout << std::endl; + } else { + std::cout << std::setw(8) << "Kokkos" << std::setw(8) << "Team V3" + << " Scratch per team is too big (KB): " + << (per_team_scratch / 1024) << std::endl; } - std::cout << "\n\n"; } } + std::cout << "\n\n"; } +} // namespace PerfTest +} // namespace KokkosBatched using namespace KokkosBatched; -template +template void run(const int N, const int B, const int R) { typedef Kokkos::DefaultExecutionSpace ExecSpace; Kokkos::print_configuration(std::cout, false); if (B != 0 && R != 0) { - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,B,R); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, B, R); } else { - std::cout << "\n\n Used for Factorization \n\n"; /// Left, Lower, NoTrans, UnitDiag (used in LU factorization and LU solve) PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 3, 3); PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 5, 5); - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,10,10); - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,15,15); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 10, 10); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 15, 15); /// Left, Lower, NoTrans, NonUnitDiag PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N, 3, 3); PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N, 5, 5); - PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N,10,10); - PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N,15,15); + PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N, 10, 10); + PerfTest::Trsm<1, ExecSpace, AlgoTagType>(N, 15, 15); /// Right, Upper, NoTrans, UnitDiag PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N, 3, 3); PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N, 5, 5); - 
PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N,10,10); - PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N,15,15); + PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N, 10, 10); + PerfTest::Trsm<2, ExecSpace, AlgoTagType>(N, 15, 15); /// Right, Upper, NoTrans, NonUnitDiag (used in LU factorization) PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N, 3, 3); PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N, 5, 5); - PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N,10,10); - PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N,15,15); + PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N, 10, 10); + PerfTest::Trsm<3, ExecSpace, AlgoTagType>(N, 15, 15); std::cout << "\n\n Used for Solve \n\n"; @@ -780,26 +771,25 @@ void run(const int N, const int B, const int R) { PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 3, 1); PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 5, 1); - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,10, 1); - PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N,15, 1); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 10, 1); + PerfTest::Trsm<0, ExecSpace, AlgoTagType>(N, 15, 1); /// Left, Upper, Notrans, NonUnitDiag (user in LU solve) PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N, 3, 1); PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N, 5, 1); - PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N,10, 1); - PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N,15, 1); + PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N, 10, 1); + PerfTest::Trsm<4, ExecSpace, AlgoTagType>(N, 15, 1); } } int main(int argc, char *argv[]) { - Kokkos::initialize(argc, argv); - int N = 128*128, B = 0, R = 0; + int N = 128 * 128, B = 0, R = 0; - for (int i=1;i(N,B,R); + run(N, B, R); std::cout << "\n Testing LayoutLeft Algo::Trsm::Blocked\n"; - run(N,B,R); + run(N, B, R); } Kokkos::finalize(); @@ -822,7 +812,7 @@ int main(int argc, char *argv[]) { return 0; } -#else +#else int main(int argc, char *argv[]) { std::cout << "Kokkos::Cuda is not enabled\n"; diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host.hpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host.hpp index 0e14fe0cf9..0770055cb0 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host.hpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host.hpp @@ -18,641 +18,602 @@ //#undef __KOKKOSBATCHED_INTEL_MKL_BATCHED__ namespace KokkosBatched { - namespace PerfTest { +namespace PerfTest { #undef FLOP_MUL #undef FLOP_ADD - // no complex yet -#if defined( KokkosBatched_Test_Trsm_Host_Complex ) +// no complex yet +#if defined(KokkosBatched_Test_Trsm_Host_Complex) #define FLOP_MUL 6.0 #define FLOP_ADD 2.0 - typedef Kokkos::complex value_type; +typedef Kokkos::complex value_type; #endif -#if defined( KokkosBatched_Test_Trsm_Host_Real ) +#if defined(KokkosBatched_Test_Trsm_Host_Real) #define FLOP_MUL 1.0 -#define FLOP_ADD 1.0 - typedef double value_type; +#define FLOP_ADD 1.0 +typedef double value_type; #endif - double FlopCountLower(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(0.5*m*n*(n+1.0)) + - FLOP_ADD*(0.5*m*n*(n-1.0))); - } - - double FlopCountUpper(int mm, int nn) { - double m = (double)mm; double n = (double)nn; - return (FLOP_MUL*(0.5*m*n*(n+1.0)) + - FLOP_ADD*(0.5*m*n*(n-1.0))); - } - - template - void Trsm(const int NN) { - typedef Kokkos::Schedule ScheduleType; +double FlopCountLower(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + + FLOP_ADD * (0.5 * m * n * (n - 1.0))); +} - constexpr int VectorLength = DefaultVectorLength::value; - const int N = NN/VectorLength; +double 
FlopCountUpper(int mm, int nn) { + double m = (double)mm; + double n = (double)nn; + return (FLOP_MUL * (0.5 * m * n * (n + 1.0)) + + FLOP_ADD * (0.5 * m * n * (n - 1.0))); +} - { - std::string value_type_name; - if (std::is_same::value) value_type_name = "double"; - if (std::is_same >::value) value_type_name = "Kokkos::complex"; -#if defined(__AVX512F__) - std::cout << "AVX512 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; +template +void Trsm(const int NN) { + typedef Kokkos::Schedule ScheduleType; + + constexpr int VectorLength = + DefaultVectorLength::value; + const int N = NN / VectorLength; + + { + std::string value_type_name; + if (std::is_same::value) value_type_name = "double"; + if (std::is_same >::value) + value_type_name = "Kokkos::complex"; +#if defined(__AVX512F__) + std::cout << "AVX512 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #elif defined(__AVX__) || defined(__AVX2__) - std::cout << "AVX or AVX2 is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "AVX or AVX2 is defined: datatype " << value_type_name + << " a vector length " << VectorLength << "\n"; #else - std::cout << "SIMD (compiler vectorization) is defined: datatype " << value_type_name << " a vector length " << VectorLength << "\n"; + std::cout << "SIMD (compiler vectorization) is defined: datatype " + << value_type_name << " a vector length " << VectorLength << "\n"; #endif - } + } + + switch (test) { + case 0: std::cout << "TestID = Left, Lower, NoTrans, UnitDiag\n"; break; + case 1: std::cout << "TestID = Left, Lower, NoTrans, NonUnitDiag\n"; break; + case 2: std::cout << "TestID = Right, Upper, NoTrans, UnitDiag\n"; break; + case 3: std::cout << "TestID = Right, Upper, NoTrans, NonUnitDiag\n"; break; + case 4: std::cout << "TestID = Left, Upper, NoTrans, NonUnitDiag\n"; break; + } - switch (test) { - case 0: std::cout << "TestID = Left, Lower, NoTrans, UnitDiag\n"; break; - case 1: std::cout << "TestID = Left, Lower, NoTrans, NonUnitDiag\n"; break; - case 2: std::cout << "TestID = Right, Upper, NoTrans, UnitDiag\n"; break; - case 3: std::cout << "TestID = Right, Upper, NoTrans, NonUnitDiag\n"; break; - case 4: std::cout << "TestID = Left, Upper, NoTrans, NonUnitDiag\n"; break; + // when m == n, lower upper does not matter (unit and nonunit) + double flop = 0; + switch (test) { + case 0: + case 1: flop = FlopCountLower(BlkSize, NumCols); break; + case 2: + case 3: + case 4: flop = FlopCountUpper(BlkSize, NumCols); break; + } + flop *= (N * VectorLength); + + const double tmax = 1.0e15; + + const int iter_begin = -10, iter_end = 100; + Kokkos::Timer timer; + + /// + /// Reference version using MKL DTRSM + /// + Kokkos::View bref; + Kokkos::View amat( + "amat", N * VectorLength, BlkSize, BlkSize), + bmat("bmat", N * VectorLength, BlkSize, NumCols); + + typedef Vector, VectorLength> VectorType; + Kokkos::View amat_simd( + "amat_simd", N, BlkSize, BlkSize), + bmat_simd("bmat_simd", N, BlkSize, NumCols); + + Random random; + + for (int k = 0; k < N * VectorLength; ++k) { + const int k0 = k / VectorLength, k1 = k % VectorLength; + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < BlkSize; ++j) { + amat(k, i, j) = random.value() + 4.0 * (i == j); + amat_simd(k0, i, j)[k1] = amat(k, i, j); + } + for (int i = 0; i < BlkSize; ++i) + for (int j = 0; j < NumCols; ++j) { + bmat(k, i, j) = random.value(); + bmat_simd(k0, i, j)[k1] = bmat(k, i, j); } + } + + // for KNL + 
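  // The benchmark flushes the last-level cache before every timed iteration
  // (see flush.run() at the top of each timing loop) so that A and B are
  // reloaded from memory instead of being measured cache-hot. LLC_CAPACITY is
  // sized for the target CPU: the 34 MB used here is consistent with the
  // aggregate per-tile L2 of a 68-core Knights Landing part (34 tiles x 1 MB),
  // whereas the GPU-side tests above use 56 * 4 MB. The idea behind the Flush
  // utility is simply to stream over a buffer at least as large as the cache;
  // an illustrative sketch (not the actual Flush implementation in this
  // repository):
  //
  //   std::vector<double> junk(LLC_CAPACITY / sizeof(double), 1.0);
  //   volatile double sink = 0;
  //   for (double x : junk) sink += x;  // touch every line to evict old data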
constexpr size_t LLC_CAPACITY = 34 * 1024 * 1024; + Flush flush; - // when m == n, lower upper does not matter (unit and nonunit) - double flop = 0; - switch (test) { - case 0: - case 1: - flop = FlopCountLower(BlkSize,NumCols); - break; - case 2: - case 3: - case 4: - flop = FlopCountUpper(BlkSize,NumCols); - break; + /// + /// Reference version using MKL DTRSM + /// +#if defined(__KOKKOSBATCHED_INTEL_MKL__) + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, NumCols); + + { + double tavg = 0, tmin = tmax; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::RangePolicy policy( + 0, N * VectorLength); + Kokkos::parallel_for( + "KokkosBatched::PerfTest::TrsmHost::MKLOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasLower, + CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + case 1: + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasLower, + CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + case 2: + cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, + CblasNoTrans, CblasUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + case 3: + cblas_dtrsm(CblasRowMajor, CblasRight, CblasUpper, + CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + case 4: + cblas_dtrsm(CblasRowMajor, CblasLeft, CblasUpper, + CblasNoTrans, CblasNonUnit, BlkSize, NumCols, 1.0, + (double *)aa.data(), aa.stride_0(), + (double *)bb.data(), bb.stride_0()); + break; + } + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - flop *= (N*VectorLength); - - const double tmax = 1.0e15; - - const int iter_begin = -10, iter_end = 100; - Kokkos::Timer timer; - - /// - /// Reference version using MKL DTRSM - /// - Kokkos::View bref; - Kokkos::View - amat("amat", N*VectorLength, BlkSize, BlkSize), - bmat("bmat", N*VectorLength, BlkSize, NumCols); - - typedef Vector,VectorLength> VectorType; - Kokkos::View - amat_simd("amat_simd", N, BlkSize, BlkSize), - bmat_simd("bmat_simd", N, BlkSize, NumCols); - - Random random; - - for (int k=0;k::abs(bmat(i, j, k)); + + std::cout << std::setw(10) << "MKL TRSM" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) << " sum abs(B) = " << sum + << std::endl; + + bref = b; + } + } +#if defined(__KOKKOSBATCHED_INTEL_MKL_BATCHED__) + { + Kokkos::View a( + "a", N * VectorLength, BlkSize, BlkSize), + b("b", N * VectorLength, BlkSize, NumCols); + + value_type *aa[N * VectorLength], *bb[N * VectorLength]; + + for (int k = 0; k < N * VectorLength; ++k) { + aa[k] = &a(k, 0, 0); + bb[k] = &b(k, 0, 0); + } + + { + double tavg = 0, tmin = tmax; + + MKL_INT blksize[1] = {BlkSize}; + MKL_INT numcols[1] = {NumCols}; + + MKL_INT lda[1] = {a.stride_1()}; 
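      // cblas_dtrsm_batch follows MKL's grouped-batch convention: side, uplo,
      // trans, diag, the matrix sizes, alpha and the leading dimensions are
      // all passed as arrays with one entry per group, and the trailing
      // arguments give the group count and the group sizes. This test uses a
      // single group of N * VectorLength same-size problems, so every array
      // declared here has exactly one element and size_per_grp[0] is
      // N * VectorLength.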
+ MKL_INT ldb[1] = {b.stride_1()}; + + double one[1] = {1.0}; + MKL_INT size_per_grp[1] = {N * VectorLength}; + + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat); + Kokkos::deep_copy(b, bmat); + + HostSpaceType().fence(); + timer.reset(); + + switch (test) { + case 0: { + CBLAS_SIDE side[1] = {CblasLeft}; + CBLAS_UPLO uplo[1] = {CblasLower}; + CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; + CBLAS_DIAG diag[1] = {CblasUnit}; + + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, + numcols, one, (const double **)aa, lda, + (double **)bb, ldb, 1, size_per_grp); + break; } - for (int i=0;i flush; - - /// - /// Reference version using MKL DTRSM - /// -#if defined(__KOKKOSBATCHED_INTEL_MKL__) - { - Kokkos::View - a("a", N*VectorLength, BlkSize, BlkSize), - b("b", N*VectorLength, BlkSize, NumCols); - - { - double tavg = 0, tmin = tmax; - for (int iter=iter_begin;iter policy(0, N*VectorLength); - Kokkos::parallel_for("KokkosBatched::PerfTest::TrsmHost::MKLOpenMP", policy, - KOKKOS_LAMBDA(const int k) { - auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - cblas_dtrsm(CblasRowMajor, - CblasLeft, CblasLower, CblasNoTrans, CblasUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - case 1: - cblas_dtrsm(CblasRowMajor, - CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - case 2: - cblas_dtrsm(CblasRowMajor, - CblasRight, CblasUpper, CblasNoTrans, CblasUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - case 3: - cblas_dtrsm(CblasRowMajor, - CblasRight, CblasUpper, CblasNoTrans, CblasNonUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - case 4: - cblas_dtrsm(CblasRowMajor, - CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, - BlkSize, NumCols, - 1.0, - (double*)aa.data(), aa.stride_0(), - (double*)bb.data(), bb.stride_0()); - break; - } - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; + case 2: { + CBLAS_SIDE side[1] = {CblasRight}; + CBLAS_UPLO uplo[1] = {CblasUpper}; + CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; + CBLAS_DIAG diag[1] = {CblasUnit}; + + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, + numcols, one, (const double **)aa, lda, + (double **)bb, ldb, 1, size_per_grp); + break; + } + case 3: { + CBLAS_SIDE side[1] = {CblasRight}; + CBLAS_UPLO uplo[1] = {CblasUpper}; + CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; + CBLAS_DIAG diag[1] = {CblasNonUnit}; + + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, + numcols, one, (const double **)aa, lda, + (double **)bb, ldb, 1, size_per_grp); + break; + } + case 4: { + CBLAS_SIDE side[1] = {CblasLeft}; + CBLAS_UPLO uplo[1] = {CblasUpper}; + CBLAS_TRANSPOSE transA[1] = {CblasNoTrans}; + CBLAS_DIAG diag[1] = {CblasNonUnit}; + + cblas_dtrsm_batch(CblasRowMajor, side, uplo, transA, diag, blksize, + numcols, one, (const double **)aa, lda, + (double **)bb, ldb, 1, size_per_grp); + break; } - tavg /= iter_end; - - double sum = 0; - for (int i=0,iend=b.extent(0);i::abs(bref(i, j, k) - + b(i, j, k)); + + std::cout << std::setw(10) << 
"MKL Batch" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; + } + } +#endif - MKL_INT lda[1] = { a.stride_1() }; - MKL_INT ldb[1] = { b.stride_1() }; +#if defined(__KOKKOSBATCHED_INTEL_MKL_COMPACT_BATCHED__) + { + Kokkos::View a( + "a", N, BlkSize, BlkSize), + b("b", N, BlkSize, NumCols); + + { + double tavg = 0, tmin = tmax; + + MKL_COMPACT_PACK format; - double one[1] = { 1.0 }; - MKL_INT size_per_grp[1] = { N*VectorLength }; + if (VectorLength == 4) + format = MKL_COMPACT_AVX; + else if (VectorLength == 8) + format = MKL_COMPACT_AVX512; - for (int iter=iter_begin;iter= 0)*t; } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=bref.extent(0);i= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=bref.extent(0);i policy(0, N*VectorLength); - // Kokkos::parallel_for - // (policy, - // KOKKOS_LAMBDA(const int k) { - // auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - // auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); - - // switch (test) { - // case 0: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // case 1: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // case 2: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // case 3: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // case 4: - // SerialTrsm:: - // invoke(1.0, aa, bb); - // break; - // } - // }); - - // HostSpaceType().fence(); - // const double t = timer.seconds(); - // tmin = std::min(tmin, t); - // tavg += (iter >= 0)*t; - // } - // tavg /= iter_end; - - // double diff = 0; - // for (int i=0,iend=bref.extent(0);i policy(0, N); - Kokkos::parallel_for("KokkosBatched::PerfTest::TrsmHost::SIMDSerialOpenMP", - policy, - KOKKOS_LAMBDA(const int k) { - auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); - - switch (test) { - case 0: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 1: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 2: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 3: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - case 4: - SerialTrsm:: - invoke(1.0, aa, bb); - break; - } - }); - - HostSpaceType().fence(); - const double t = timer.seconds(); - tmin = std::min(tmin, t); - tavg += (iter >= 0)*t; - } - tavg /= iter_end; - - double diff = 0; - for (int i=0,iend=bref.extent(0);i policy(0, + // N*VectorLength); Kokkos::parallel_for + // (policy, + // KOKKOS_LAMBDA(const int k) { + // auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + // auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + + // switch (test) { + // case 0: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // case 1: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // case 2: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // case 3: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // case 4: + // SerialTrsm:: + // invoke(1.0, aa, bb); + // break; + // } + // }); + + // HostSpaceType().fence(); + // const double t = timer.seconds(); + // tmin = std::min(tmin, t); + // tavg += (iter >= 0)*t; + // } + // tavg /= iter_end; + + // double diff = 0; + // for (int i=0,iend=bref.extent(0);i::abs(bref(i,j,k) - + // b(i,j,k)); + + // std::cout << std::setw(10) << "KK Scalar" + // << " BlkSize = " << std::setw(3) 
<< BlkSize + // << " NumCols = " << std::setw(3) << NumCols + // << " time = " << std::scientific << tmin + // << " avg flop/s = " << (flop/tavg) + // << " max flop/s = " << (flop/tmin) + // << " diff to ref = " << diff + // << std::endl; + // } + // } + + /// + /// SIMD with appropriate data layout + /// + { + Kokkos::View a( + "a", N, BlkSize, BlkSize), + b("b", N, BlkSize, NumCols); + + { + double tavg = 0, tmin = tmax; + for (int iter = iter_begin; iter < iter_end; ++iter) { + // flush + flush.run(); + + // initialize matrices + Kokkos::deep_copy(a, amat_simd); + Kokkos::deep_copy(b, bmat_simd); + + HostSpaceType().fence(); + timer.reset(); + + Kokkos::RangePolicy policy(0, N); + Kokkos::parallel_for( + "KokkosBatched::PerfTest::TrsmHost::SIMDSerialOpenMP", policy, + KOKKOS_LAMBDA(const int k) { + auto aa = Kokkos::subview(a, k, Kokkos::ALL(), Kokkos::ALL()); + auto bb = Kokkos::subview(b, k, Kokkos::ALL(), Kokkos::ALL()); + + switch (test) { + case 0: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 1: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 2: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 3: + SerialTrsm::invoke(1.0, aa, bb); + break; + case 4: + SerialTrsm::invoke(1.0, aa, bb); + break; + } + }); + + HostSpaceType().fence(); + const double t = timer.seconds(); + tmin = std::min(tmin, t); + tavg += (iter >= 0) * t; } - std::cout << "\n\n"; + tavg /= iter_end; + + double diff = 0; + for (int i = 0, iend = bref.extent(0); i < iend; ++i) + for (int j = 0, jend = bref.extent(1); j < jend; ++j) + for (int k = 0, kend = bref.extent(2); k < kend; ++k) + diff += Kokkos::ArithTraits::abs( + bref(i, j, k) - b(i / VectorLength, j, k)[i % VectorLength]); + + std::cout << std::setw(10) << "KK Vector" + << " BlkSize = " << std::setw(3) << BlkSize + << " NumCols = " << std::setw(3) << NumCols + << " time = " << std::scientific << tmin + << " avg flop/s = " << (flop / tavg) + << " max flop/s = " << (flop / tmin) + << " diff to ref = " << diff << std::endl; } } + std::cout << "\n\n"; } - +} // namespace PerfTest +} // namespace KokkosBatched diff --git a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp index bb82e0e56d..3d45195bb1 100644 --- a/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp +++ b/perf_test/batched/do-not-use/KokkosBatched_Test_Trsm_Host_Real.cpp @@ -5,7 +5,7 @@ using namespace KokkosBatched; -template +template void run(const int N) { typedef Kokkos::DefaultHostExecutionSpace HostSpaceType; @@ -15,56 +15,55 @@ void run(const int N) { /// Left, Lower, NoTrans, UnitDiag (used in LU factorization and LU solve) - PerfTest::Trsm<0, 3, 3, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0, 5, 5, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0,10,10, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0,15,15, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<0, 3, 3, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 5, 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 10, 10, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 15, 15, HostSpaceType, AlgoTagType>(N); /// Left, Lower, NoTrans, NonUnitDiag - PerfTest::Trsm<1, 3, 3, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<1, 5, 5, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<1,10,10, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<1,15,15, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<1, 3, 3, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<1, 5, 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<1, 10, 
10, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<1, 15, 15, HostSpaceType, AlgoTagType>(N); /// Right, Upper, NoTrans, UnitDiag - PerfTest::Trsm<2, 3, 3, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<2, 5, 5, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<2,10,10, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<2,15,15, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<2, 3, 3, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<2, 5, 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<2, 10, 10, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<2, 15, 15, HostSpaceType, AlgoTagType>(N); /// Right, Upper, NoTrans, NonUnitDiag (used in LU factorization) - PerfTest::Trsm<3, 3, 3, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<3, 5, 5, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<3,10,10, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<3,15,15, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<3, 3, 3, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<3, 5, 5, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<3, 10, 10, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<3, 15, 15, HostSpaceType, AlgoTagType>(N); std::cout << "\n\n Used for Solve \n\n"; /// Left, Lower, NoTrans, UnitDiag (used in LU solve) - PerfTest::Trsm<0, 3, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0, 5, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0,10, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<0,15, 1, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<0, 3, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 5, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 10, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<0, 15, 1, HostSpaceType, AlgoTagType>(N); /// Left, Upper, Notrans, NonUnitDiag (user in LU solve) - PerfTest::Trsm<4, 3, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<4, 5, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<4,10, 1, HostSpaceType,AlgoTagType>(N); - PerfTest::Trsm<4,15, 1, HostSpaceType,AlgoTagType>(N); + PerfTest::Trsm<4, 3, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<4, 5, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<4, 10, 1, HostSpaceType, AlgoTagType>(N); + PerfTest::Trsm<4, 15, 1, HostSpaceType, AlgoTagType>(N); } -int main(int argc, char *argv[]) { - +int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) - int N = 128*128; + int N = 128 * 128; - for (int i=1;i(N); #endif - } #endif Kokkos::finalize(); diff --git a/perf_test/blas/KokkosBlas_blas1.cpp b/perf_test/blas/KokkosBlas_blas1.cpp index 01c6c430fa..764f800f39 100644 --- a/perf_test/blas/KokkosBlas_blas1.cpp +++ b/perf_test/blas/KokkosBlas_blas1.cpp @@ -46,10 +46,10 @@ #include #include #ifdef HAVE_MPI -# include +#include #else -# include -#endif // HAVE_MPI +#include +#endif // HAVE_MPI using Teuchos::Comm; using Teuchos::CommandLineProcessor; @@ -60,61 +60,58 @@ using Teuchos::TimeMonitor; // Create a new timer with the given name if it hasn't already been // created, else get the previously created timer with that name. -RCP